# Imports

In [1]:
import pandas as pd
import spacy
import string
import numpy as np
import re
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import random
from sklearn.pipeline import Pipeline 
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report

ModuleNotFoundError: No module named 'spacy'

In [None]:
# Porter stemmer and NLTK's English stop-word list; the list is printed for
# inspection. NOTE(review): assumes the NLTK 'stopwords' corpus has already
# been downloaded -- confirm for fresh environments.
ps = nltk.PorterStemmer()
stop_words = stopwords.words("english")
print(stop_words)

# Helper Functions

In [None]:
class spacy_ops_en(object):
    """Callable tokenizer backed by spaCy's ``en_core_web_sm`` model.

    An instance can be handed to anything that expects a ``str -> list[str]``
    tokenizer callable: calling it with a string returns the token texts.
    """

    def __init__(self):
        # Load the model once per instance; every call reuses it.
        self.nlp = spacy.load('en_core_web_sm')

    def __call__(self, some_text):
        """Split *some_text* into tokens.

        Args:
            some_text: The raw string to tokenize.

        Returns:
            A list holding the text of each token, in document order.
        """
        # Only token boundaries are needed here, so run the tokenizer alone
        # via make_doc() rather than the full pipeline: the downstream
        # components add per-document cost and their annotations are never
        # read by this method.
        doc = self.nlp.make_doc(some_text)
        return [token.text for token in doc]

# Model

In [None]:
# Text classifier: binary trigram bag-of-words features fed into a
# one-vs-rest logistic regression.
text_clf = Pipeline(steps=[
    (
        'vect',
        CountVectorizer(
            tokenizer=spacy_ops_en(),  # spaCy tokens instead of the default regex
            lowercase=False,           # keep the original casing
            ngram_range=(3, 3),        # trigrams only
            binary=True,               # presence/absence rather than counts
            max_features=1000,         # cap the vocabulary size
        ),
    ),
    (
        'clf',
        LogisticRegression(
            solver='saga',
            multi_class='ovr',
            max_iter=10000,
            random_state=1,            # fixed seed for repeatable fits
        ),
    ),
])


# Data

In [None]:
# Read the airline-tweet sentiment CSV directly from GitHub and preview the
# first rows (the bare .head() expression is what a notebook cell displays).
data_source_url = (
    "https://raw.githubusercontent.com/kolaveridi/"
    "kaggle-Twitter-US-Airline-Sentiment-/master/Tweets.csv"
)
airline_tweets = pd.read_csv(filepath_or_buffer=data_source_url)
airline_tweets.head()

In [None]:
# Balance the dataset: cut every sentiment class down to the size of the
# rarest one by keeping the first CLASS_SIZE rows of each class, stacked in
# positive / negative / neutral order.
CLASS_SIZE = min(Counter(airline_tweets["airline_sentiment"]).values())
airline_tweets = pd.concat([
    airline_tweets[airline_tweets['airline_sentiment'] == sentiment][:CLASS_SIZE]
    for sentiment in ("positive", "negative", "neutral")
])

# Sanity check: every class should now have the same count.
print(Counter(airline_tweets["airline_sentiment"]))

In [None]:
# The sentiment column is the target; the tweet text is the model input.
labels = airline_tweets["airline_sentiment"].values
features = airline_tweets["text"].values

assert len(labels) == len(features)

# Stratified 80/20 train/test split with a fixed seed so the split is
# repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=882,
    stratify=labels,
)

In [None]:
# Peek at the first example of each split: train text, test text, train
# label, test label.
for split in (X_train, X_test, Y_train, Y_test):
    print(split[0])


# Training

In [None]:
# Fit the vectorizer and the classifier on the training split.
text_clf.fit(X=X_train, y=Y_train)

In [None]:
# Predict a sentiment label for every held-out tweet.
predictions = text_clf.predict(X=X_test)

In [None]:
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(Y_test, predictions))

In [None]:
# For each sentiment class, list the k features whose logistic-regression
# coefficients are largest, i.e. the trigrams that push hardest towards that
# class.
print("\n\nFEATURES\n\n")
fitted_vect = text_clf.named_steps['vect']
fitted_clf = text_clf.named_steps['clf']

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on versions that
# predate the replacement.
if hasattr(fitted_vect, 'get_feature_names_out'):
    clf_features = np.array(fitted_vect.get_feature_names_out())
else:
    clf_features = np.array(fitted_vect.get_feature_names())

# Find each class's coefficient row through classes_ instead of relying on
# a hard-coded position in coef_.
class_rows = {label: row for row, label in enumerate(fitted_clf.classes_)}

# NOTE(review): the headline/domestic/international names look carried over
# from a different task; they are kept unchanged in case later cells
# reference them. They hold the positive / neutral / negative rows.
clf_coeffs_headline = fitted_clf.coef_[class_rows['positive']]
clf_coeffs_domestic = fitted_clf.coef_[class_rows['neutral']]
clf_coeffs_international = fitted_clf.coef_[class_rows['negative']]

k = 25
# argsort is ascending, so take the last k indices and reverse them to get
# the strongest feature first.
highest_headline_features = clf_coeffs_headline.argsort()[-k:][::-1]
highest_domestic_features = clf_coeffs_domestic.argsort()[-k:][::-1]
highest_international_features = clf_coeffs_international.argsort()[-k:][::-1]

for header, top_indices in (
    ("\n{} Strongest feats for marking something 'Positive'\n\n",
     highest_headline_features),
    ("\n{} Strongest feats for marking something 'Neutral'\n\n",
     highest_domestic_features),
    ("\n{} Strongest feats for marking something 'Negative'\n\n",
     highest_international_features),
):
    print(header.format(k))
    for feat in clf_features[top_indices]:
        print(feat)