In [0]:
import pandas as p
import numpy as np
import itertools
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer
import re

In [0]:
def getCleanTweet(text):
    tok = WordPunctTokenizer()
    user_pattern = '@[A-Za-z0-9_]+'
    http_pattern = 'https?://[^ ]+'
    www_pattern = 'www.[^ ]+'
    combined_pattern = '|'.join((user_pattern, http_pattern, www_pattern))
    negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                    "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                    "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                    "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                    "mustn't":"must not"}
    neg_pattern = re.compile('\b(' + '|'.join(negations_dic.keys()) + ')\b')
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    try:
        bom_removed = souped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        bom_removed = souped
    stripped = re.sub(combined_pattern, '', bom_removed)
    neg_handled = neg_pattern.sub(lambda x: negations_dic[x.group()], stripped)
    letters_only = re.sub("[^a-zA-Z]", " ", neg_handled).lower()
    cleaned = (" ".join(x for x in tok.tokenize(letters_only) if len(x) > 1)).strip()
    cleaned = ''.join(k + k if sum(1 for i in g) > 1 else k for k, g in itertools.groupby(cleaned))
    return cleaned

In [3]:
data = p.read_csv("tweet.csv", encoding="latin-1", names=["target", "ids", "date", "flag", "user", "text"])
data['clean_text'] = [getCleanTweet(text) for text in data['text']]
data.dropna(inplace=True)
data.reset_index(drop=True,inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 7 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   target      1600000 non-null  int64 
 1   ids         1600000 non-null  int64 
 2   date        1600000 non-null  object
 3   flag        1600000 non-null  object
 4   user        1600000 non-null  object
 5   text        1600000 non-null  object
 6   clean_text  1600000 non-null  object
dtypes: int64(2), object(5)
memory usage: 85.4+ MB


In [0]:
x_values = data['clean_text'].values
y_values = data['target'].values
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x_values, y_values, test_size=0.01, random_state=0)

# Create 4 models - 
# tf unigram,  
# tf idf unigram, 
# tf unigram+bigram, 
# tf idf unigram+bigram
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

tf_unigram = CountVectorizer(encoding='latin-1',binary=False,min_df = 5 )
tf_idf_unigram = TfidfVectorizer(encoding='latin-1',use_idf=True,min_df = 5)
tf_uni_bi = CountVectorizer(encoding='latin-1',binary=False, ngram_range=(1,2),min_df = 5)
tf_idf_uni_bi = TfidfVectorizer(encoding='latin-1',use_idf=True, ngram_range=(1,2),min_df = 5)
tf_idf_bi = TfidfVectorizer(encoding='latin-1',use_idf=True, ngram_range=(2,2),min_df = 5)

In [0]:
def show_most_and_least_informative_features(vectorizer, clf, n=10):
    feature_names = vectorizer.get_feature_names()
    importances = list(clf.feature_importances_)
    coefs_with_fns = sorted(zip(importances, feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[-n:])
    print("Top ", n, " most and least informative features")
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))

In [30]:
#tf_unigram
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
import numpy as np
SVCpipe = Pipeline([('SVC',LinearSVC())])

from sklearn.svm import LinearSVC as svm
x_train_vec_tf = tf_unigram.fit_transform(x_train)
x_test_vec_tf = tf_unigram.transform(x_test)

# Gridsearch to determine the value of C
param_grid = {'SVC__C':(0.85,1)}
linearSVC = GridSearchCV(SVCpipe,param_grid,cv=3,return_train_score=True)
linearSVC.fit(x_train_vec_tf,y_train)
print(linearSVC.best_params_)
#linearSVC.coef_
#linearSVC.intercept_

bestlinearSVC = linearSVC.best_estimator_
bestlinearSVC.fit(x_train_vec_tf,y_train)
bestlinearSVC.coef_ = bestlinearSVC.named_steps['SVC'].coef_

score_tf_unigram = bestlinearSVC.score(x_test_vec_tf,y_test)
print(bestlinearSVC.score(x_train_vec_tf,y_train))
print(bestlinearSVC.score(x_test_vec_tf,y_test))
#show_most_and_least_informative_features(tf_unigram, svm, n=10)




{'SVC__C': 0.85}
0.8117638888888888
0.799


In [7]:
#IDF-Unigram+Bigram
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
import numpy as np
SVCpipe = Pipeline([('SVC',LinearSVC())])

from sklearn.svm import LinearSVC as svm
x_train_vec_idf_uni_bi = tf_idf_uni_bi.fit_transform(x_train)
x_test_vec_idf_uni_bi = tf_idf_uni_bi.transform(x_test)

# Gridsearch to determine the value of C
param_grid = {'SVC__C':(1,0.85)}
linearSVC = GridSearchCV(SVCpipe,param_grid,cv=3,return_train_score=True)
linearSVC.fit(x_train_vec_idf_uni_bi,y_train)
print(linearSVC.best_params_)

bestlinearSVC = linearSVC.best_estimator_
bestlinearSVC.fit(x_train_vec_idf_uni_bi,y_train)
bestlinearSVC.coef_ = bestlinearSVC.named_steps['SVC'].coef_

score_idf_unibi = bestlinearSVC.score(x_test_vec_idf_uni_bi,y_test)

print(bestlinearSVC.score(x_train_vec_idf_uni_bi,y_train))
print(bestlinearSVC.score(x_test_vec_idf_uni_bi,y_test))
#

{'SVC__C': 0.85}
0.8864646464646465
0.818


In [8]:
#IDF Unigram
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
import numpy as np
SVCpipe = Pipeline([('SVC',LinearSVC())])

from sklearn.svm import LinearSVC as svm
x_train_vec_idf = tf_idf_unigram.fit_transform(x_train)
x_test_vec_idf = tf_idf_unigram.transform(x_test)


# Gridsearch to determine the value of C
param_grid = {'SVC__C':(0.85,1)}
linearSVC = GridSearchCV(SVCpipe,param_grid,cv=3,return_train_score=True)
linearSVC.fit(x_train_vec_idf,y_train)
print(linearSVC.best_params_)
#linearSVC.coef_
#linearSVC.intercept_

bestlinearSVC = linearSVC.best_estimator_
bestlinearSVC.fit(x_train_vec_idf,y_train)
bestlinearSVC.coef_ = bestlinearSVC.named_steps['SVC'].coef_

score_idf_uni = bestlinearSVC.score(x_test_vec_idf,y_test)

print(bestlinearSVC.score(x_train_vec_idf,y_train))
print(bestlinearSVC.score(x_test_vec_idf,y_test))

#show_most_and_least_informative_features(tf_idf_unigram, svm, n=10)

{'SVC__C': 0.85}
0.8124450757575757
0.800375


In [9]:
#TF Unigram+Bigram
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
import numpy as np
SVCpipe = Pipeline([('SVC',LinearSVC())])
from sklearn.svm import LinearSVC as svm

x_train_vec_tf_uni_bi = tf_uni_bi.fit_transform(x_train)
x_test_vec_tf_uni_bi = tf_uni_bi.transform(x_test)

# Gridsearch to determine the value of C
param_grid = {'SVC__C':(1,0.85)}
linearSVC = GridSearchCV(SVCpipe,param_grid,cv=3,return_train_score=True)
linearSVC.fit(x_train_vec_tf_uni_bi,y_train)
print(linearSVC.best_params_)
#linearSVC.coef_
#linearSVC.intercept_

bestlinearSVC = linearSVC.best_estimator_
bestlinearSVC.fit(x_train_vec_tf_uni_bi,y_train)
bestlinearSVC.coef_ = bestlinearSVC.named_steps['SVC'].coef_

score_tf_unibi = bestlinearSVC.score(x_test_vec_tf_uni_bi,y_test)

print(bestlinearSVC.score(x_train_vec_tf_uni_bi,y_train))
print(bestlinearSVC.score(x_test_vec_tf_uni_bi,y_test))

#show_most_and_least_informative_features(tf_uni_bi, svm, n=10)



{'SVC__C': 0.85}
0.9055694444444444
0.7959375


In [31]:
import pandas as pd
scores = [score_tf_unibi, score_idf_uni, score_idf_unibi, score_tf_unigram]
names = ['TF+unigram_Bigram', 'TFIDF+Unigram', 'TFIDF+Unigram+Bigram', 'TF+Unigram']
lst = pd.DataFrame(zip(names,scores),columns = ['Vectorization','Score'])
lst

Unnamed: 0,Vectorization,Score
0,TF+unigram_Bigram,0.795937
1,TFIDF+Unigram,0.800375
2,TFIDF+Unigram+Bigram,0.818
3,TF+Unigram,0.799


In [32]:
#Random Forest
#TF-Unigram
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [90],
    'max_features': ['auto'],
    'min_samples_leaf': [4],
    'n_estimators': [200]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 2, n_jobs = -1, verbose = 2)

grid_search.fit(x_train_vec_tf,y_train)
score_rf_tf_unigram = grid_search.score(x_test_vec_tf,y_test)
grid_search.score(x_test_vec_tf,y_test)


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed: 24.5min finished


0.7673125

In [21]:
#TFIDF Unigram+Bigram

param_grid = {
    'bootstrap': [True],
    'max_depth': [90],
    'max_features': ['auto'],
    'min_samples_leaf': [4],
    'n_estimators': [200]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 2, n_jobs = -1, verbose = 2)

grid_search.fit(x_train_vec_idf_uni_bi,y_train)
score_rf_tfidf_unibi = grid_search.score(x_test_vec_idf_uni_bi,y_test)
grid_search.score(x_test_vec_idf_uni_bi,y_test)


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed: 36.9min finished


0.765625

In [22]:
#TFIDF Unigram

param_grid = {
    'bootstrap': [True],
    'max_depth': [90],
    'max_features': ['auto'],
    'min_samples_leaf': [4],
    'n_estimators': [200]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 2, n_jobs = -1, verbose = 2)

grid_search.fit(x_train_vec_idf,y_train)
score_rf_tfidf_uni = grid_search.score(x_test_vec_idf,y_test)
grid_search.score(x_test_vec_idf,y_test)

Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed: 25.2min finished


0.771125

In [23]:
#TF Unigram+Bigram

param_grid = {
    'bootstrap': [True],
    'max_depth': [90],
    'max_features': ['auto'],
    'min_samples_leaf': [4],
    'n_estimators': [200]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 2, n_jobs = -1, verbose = 2)

grid_search.fit(x_train_vec_tf_uni_bi,y_train)
score_rf_tf_uni_bi = grid_search.score(x_test_vec_tf_uni_bi,y_test)
grid_search.score(x_test_vec_tf_uni_bi,y_test)

Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed: 35.5min finished


0.767625

In [33]:
score_rf = [score_rf_tf_uni_bi, score_rf_tf_unigram, score_rf_tfidf_uni, score_rf_tfidf_unibi]
names = ["Tf+Unigram+Bigram","TF+Unigram","TFIDF+Unigram","TFIDF+Uni+Bigram"]

lst = pd.DataFrame(zip(names,score_rf),columns=["Vectorization","Score"], index=[1,2,3,4])
lst

Unnamed: 0,Vectorization,Score
1,Tf+Unigram+Bigram,0.767625
2,TF+Unigram,0.767312
3,TFIDF+Unigram,0.771125
4,TFIDF+Uni+Bigram,0.765625
