In [1]:
from collections import Counter

from sklearn.datasets import load_iris
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from imblearn.datasets import make_imbalance
from imblearn.under_sampling import NearMiss
from imblearn.pipeline import make_pipeline
from imblearn.metrics import classification_report_imbalanced

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import spacy

In [3]:
# Load the spaCy pipeline named 'en_core_web_lg'; `nlp` is applied to every
# message further down to produce the 'doc' column.
nlp = spacy.load('en_core_web_lg')

In [4]:
# Read the training data from train.csv (relative to the working directory).
# Later cells rely on its 'message' and 'sentiment' columns.
train = pd.read_csv('train.csv')

In [5]:
# Parse every message with the spaCy pipeline and store the resulting Doc
# objects in a 'doc' column. nlp.pipe processes the texts as a batched stream
# (one Doc per input text, in input order), which is spaCy's recommended way
# to handle many texts instead of calling nlp(text) once per row.
train['doc'] = list(nlp.pipe(train['message']))

In [6]:
# Number of tokens per message: len() applied to each parsed Doc.
train['num_tokens'] = train['doc'].map(len)

In [7]:
def _pos_counts(doc):
    """Return a one-element list holding a Counter of the ``pos_`` labels in ``doc``."""
    tag_counts = Counter()
    for token in doc:
        tag_counts[token.pos_] += 1
    # Wrapped in a list because the feature cells index the value with [0].
    return [tag_counts]


# Per-message tally of part-of-speech labels.
train['countDict'] = train['doc'].apply(_pos_counts)

In [8]:
# Percentage of each message's tokens that carry a given POS label.
# 'countDict' holds a one-element list, hence the [0] before the label lookup;
# a label that never occurred in the message reads as 0 from the Counter.
pos_share_columns = {'Adj': 'ADJ', 'Noun': 'NOUN', 'Punct': 'PUNCT', 'Verb': 'VERB'}
for column_name, pos_label in pos_share_columns.items():
    label_counts = train['countDict'].apply(lambda wrapped: wrapped[0][pos_label])
    train[column_name] = label_counts / train['num_tokens'] * 100

In [18]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Score every raw message with VADER. Each cell of 'Polarity' is the dict
# returned by polarity_scores (keys 'neg', 'neu', 'pos', 'compound').
sid = SentimentIntensityAnalyzer()
train['Polarity'] = train['message'].apply(sid.polarity_scores)

In [19]:
# Preview the first rows to inspect the engineered columns, including 'Polarity'.
train.head()

Unnamed: 0,sentiment,message,tweetid,doc,num_tokens,countDict,Adj,Noun,Punct,Verb,Polarity
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221,"(PolySciMajor, EPA, chief, does, n't, think, c...",24,"[{'PROPN': 2, 'NOUN': 6, 'VERB': 4, 'ADV': 1, ...",12.5,25.0,16.666667,16.666667,"{'neg': 0.0, 'neu': 0.905, 'pos': 0.095, 'comp..."
1,1,It's not like we lack evidence of anthropogeni...,126103,"(It, 's, not, like, we, lack, evidence, of, an...",11,"[{'PRON': 2, 'VERB': 2, 'ADV': 1, 'ADP': 2, 'N...",18.181818,18.181818,0.0,18.181818,"{'neg': 0.167, 'neu': 0.552, 'pos': 0.281, 'co..."
2,2,RT @RawStory: Researchers say we have three ye...,698562,"(RT, @RawStory, :, Researchers, say, we, have,...",22,"[{'PROPN': 3, 'PUNCT': 2, 'NOUN': 4, 'VERB': 4...",4.545455,18.181818,9.090909,18.181818,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736,"(#, TodayinMaker, #, WIRED, :, 2016, was, a, p...",17,"[{'SYM': 1, 'PROPN': 3, 'PUNCT': 1, 'NUM': 1, ...",5.882353,23.529412,5.882353,11.764706,"{'neg': 0.245, 'neu': 0.755, 'pos': 0.0, 'comp..."
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954,"(RT, @SoyNovioDeTodas, :, It, 's, 2016, ,, and...",25,"[{'PROPN': 1, 'VERB': 5, 'PUNCT': 5, 'PRON': 1...",4.0,24.0,20.0,20.0,"{'neg': 0.299, 'neu': 0.701, 'pos': 0.0, 'comp..."


In [20]:
# Pull the 'compound' value out of each polarity dict into its own numeric column.
train['compound'] = [scores['compound'] for scores in train['Polarity']]

In [21]:
# Preview the first rows again to confirm the new 'compound' column.
train.head()

Unnamed: 0,sentiment,message,tweetid,doc,num_tokens,countDict,Adj,Noun,Punct,Verb,Polarity,compound
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221,"(PolySciMajor, EPA, chief, does, n't, think, c...",24,"[{'PROPN': 2, 'NOUN': 6, 'VERB': 4, 'ADV': 1, ...",12.5,25.0,16.666667,16.666667,"{'neg': 0.0, 'neu': 0.905, 'pos': 0.095, 'comp...",0.2244
1,1,It's not like we lack evidence of anthropogeni...,126103,"(It, 's, not, like, we, lack, evidence, of, an...",11,"[{'PRON': 2, 'VERB': 2, 'ADV': 1, 'ADP': 2, 'N...",18.181818,18.181818,0.0,18.181818,"{'neg': 0.167, 'neu': 0.552, 'pos': 0.281, 'co...",0.1159
2,2,RT @RawStory: Researchers say we have three ye...,698562,"(RT, @RawStory, :, Researchers, say, we, have,...",22,"[{'PROPN': 3, 'PUNCT': 2, 'NOUN': 4, 'VERB': 4...",4.545455,18.181818,9.090909,18.181818,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736,"(#, TodayinMaker, #, WIRED, :, 2016, was, a, p...",17,"[{'SYM': 1, 'PROPN': 3, 'PUNCT': 1, 'NUM': 1, ...",5.882353,23.529412,5.882353,11.764706,"{'neg': 0.245, 'neu': 0.755, 'pos': 0.0, 'comp...",-0.5994
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954,"(RT, @SoyNovioDeTodas, :, It, 's, 2016, ,, and...",25,"[{'PROPN': 1, 'VERB': 5, 'PUNCT': 5, 'PRON': 1...",4.0,24.0,20.0,20.0,"{'neg': 0.299, 'neu': 0.701, 'pos': 0.0, 'comp...",-0.7506


In [None]:
# Modelling frame with every engineered feature; the target 'sentiment' is
# kept as the last column because X / y are later split by position.
# NOTE(review): `df` is reassigned to a Verb-only frame in the following cell,
# so this wider selection is not what the model below is trained on.
df = train.loc[
    :,
    ['num_tokens', 'Adj', 'Noun', 'Punct', 'Verb', 'compound', 'sentiment'],
]

In [69]:
# Modelling frame with a single feature ('Verb') plus the target, which stays
# in the last column because X / y are later split by position.
# NOTE(review): this replaces any earlier `df`, so the classifier below only
# ever sees the verb percentage.
df = train.loc[:, ['Verb', 'sentiment']]

In [76]:
# Preview the frame that is actually fed to the model.
df.head()

Unnamed: 0,Verb,sentiment
0,16.666667,1
1,18.181818,1
2,18.181818,2
3,11.764706,1
4,20.0,1


In [77]:
# Positional split: every column except the last is a feature, the last
# column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [78]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [79]:
# NearMiss (version 2) undersampling followed by a linear SVM, fitted on the
# training split. random_state pins LinearSVC's internal data shuffling so
# that re-running the notebook reproduces the same fitted model.
pipeline = make_pipeline(
    NearMiss(version=2),
    LinearSVC(random_state=42),
)
pipeline.fit(X_train, y_train)



Pipeline(steps=[('nearmiss', NearMiss(version=2)), ('linearsvc', LinearSVC())])

In [80]:
from sklearn import metrics

# Predicted labels for the held-out split; `predictions` is reused by the
# classification report further down.
predictions = pipeline.predict(X_test)

# Confusion matrix: rows are the true labels, columns the predicted labels.
conf_matrix = metrics.confusion_matrix(y_test, predictions)
print(conf_matrix)

[[ 261    0   86   54]
 [ 417    0  154   95]
 [1605    0  630  363]
 [ 841    0  114  126]]


In [81]:
# Per-class precision / recall / F1 on the held-out split.
# The model never predicts class 0, so its precision is undefined;
# zero_division=0 states explicitly that such a score is reported as 0 and
# stops scikit-learn from emitting an UndefinedMetricWarning for it.
print(metrics.classification_report(y_test, predictions, zero_division=0))

              precision    recall  f1-score   support

          -1       0.08      0.65      0.15       401
           0       0.00      0.00      0.00       666
           1       0.64      0.24      0.35      2598
           2       0.20      0.12      0.15      1081

    accuracy                           0.21      4746
   macro avg       0.23      0.25      0.16      4746
weighted avg       0.40      0.21      0.24      4746



  _warn_prf(average, modifier, msg_start, len(result))


In [82]:
# imbalanced-learn's report, which adds the spe / geo / iba columns.
# Reuses `predictions` (computed once from pipeline.predict(X_test) in the
# confusion-matrix cell) instead of running the pipeline over X_test again,
# so both reports are guaranteed to describe the same predictions.
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

         -1       0.08      0.65      0.34      0.15      0.47      0.23       401
          0       0.00      0.00      1.00      0.00      0.00      0.00       666
          1       0.64      0.24      0.84      0.35      0.45      0.19      2598
          2       0.20      0.12      0.86      0.15      0.32      0.09      1081

avg / total       0.40      0.21      0.82      0.24      0.36      0.14      4746



  _warn_prf(average, modifier, msg_start, len(result))
