In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier
import time

In [3]:
X = np.genfromtxt('../data/X.csv', delimiter=',')
Y = np.genfromtxt('../data/Y.csv', delimiter=',')

In [4]:
X_comp, X_test, Y_comp, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
Xtr, Xva, Ytr, Yva = train_test_split(X_comp, Y_comp, test_size=0.2)

In [28]:
lr = LogisticRegression(C=2, penalty='l1')

mlp = MLPClassifier(max_iter=1000, hidden_layer_sizes=(100,), activation='identity', 
                            solver='sgd', alpha=0.01, learning_rate='adaptive')

gb = GradientBoostingClassifier(max_depth=3,loss="deviance",learning_rate=0.1,
                                 n_estimators=1500,min_samples_split=20,min_samples_leaf=9,
                                 max_features="sqrt", subsample=0.5)

In [33]:
combined = VotingClassifier(estimators=[('lr', lr), ('nn', mlp), ('gb', gb)],
                         voting='soft', flatten_transform=True) # weights=?

In [16]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))

nltk.download('wordnet')

stemmer = WordNetLemmatizer()
analyzer = CountVectorizer().build_analyzer()

def stemmed_words(post):
    return (stemmer.lemmatize(w, pos='v') for w in analyzer(post) if w not in stop_words)

vectorizer = CountVectorizer(stop_words='english', min_df=0.005, lowercase=True, decode_error='ignore', analyzer=stemmed_words)

[nltk_data] Downloading package wordnet to /Users/Shiqi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [49]:
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(vectorizer, lr)
pipe.fit(X, Y)



Pipeline(memory=None,
         steps=[('countvectorizer',
                 CountVectorizer(analyzer=<function stemmed_words at 0x1a2b7fd320>,
                                 binary=False, decode_error='ignore',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=0.005,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('logisticregression',
                 LogisticRegression(C=2, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                            

In [12]:
import pandas as pd

data = pd.read_csv(r"train.csv", engine='python')

In [21]:
data2 = data.drop(data.columns[0], axis=1)
X = data2[data2.columns[1]]
Y = data2[data2.columns[0]]

In [50]:
import eli5
eli5.show_weights(lr, vec=vectorizer, top=20, target_names=['negative', 'positive'])

Weight?,Feature
+2.154,welcome
+2.097,followfriday
+1.718,thank
+1.285,www
+1.225,enjoy
+1.174,glad
+1.158,amaze
+1.134,awesome
+1.122,hi
+1.110,cute


In [57]:
tweet1 = "@Starrbby too bad I won't be around I lost my job and can't even pay my phone bill lmao aw shucks"
eli5.show_prediction(lr, tweet1, vec=vectorizer, target_names=['negative', 'positive'])

Contribution?,Feature
1.161,lose
1.141,bad
0.981,phone
0.389,even
0.099,pay
0.091,job
-0.181,around
-0.287,<BIAS>


In [59]:
tweet2 = "	@jdarter Oh! Haha... dude I dont really look at em unless someone says HEY I ADDED YOU. Sorry I'm so terrible at that. I need a pop up!"
eli5.show_prediction(lr, tweet2, vec=vectorizer, target_names=['negative', 'positive'])

Contribution?,Feature
1.661,sorry
0.529,dont
0.3,really
0.283,oh
0.28,someone
0.19,need
0.062,say
-0.122,look
-0.287,<BIAS>
-0.6,haha


In [42]:
pipe = make_pipeline(vectorizer, gb)
pipe.fit(X, Y)

Pipeline(memory=None,
         steps=[('countvectorizer',
                 CountVectorizer(analyzer=<function stemmed_words at 0x1a2b7fd320>,
                                 binary=False, decode_error='ignore',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=0.005,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_patter...
                                            learning_rate=0.1, loss='deviance',
                                            max_depth=3, max_features='sqrt',
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
     

In [48]:
import eli5
eli5.show_weights(gb, vec=vectorizer, top=20, target_names=['negative', 'positive'])

Weight,Feature
0.0845  ± 0.1299,thank
0.0576  ± 0.1164,sad
0.0566  ± 0.1266,miss
0.0496  ± 0.1276,sorry
0.0280  ± 0.1070,wish
0.0265  ± 0.1142,love
0.0259  ± 0.0810,suck
0.0221  ± 0.1097,good
0.0174  ± 0.0986,bad
0.0172  ± 0.0835,great
