In [19]:
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix, hstack
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from helpers import preprocessing_pipeline, count_syntactic_features

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [2]:
raw_training_data = pd.read_csv("data/train.csv")
training_data = preprocessing_pipeline(raw_training_data)

In [3]:
tweet_tokenizer = TweetTokenizer()
def tokenize(tweet):
    return tweet_tokenizer.tokenize(tweet)
vectorizer = TfidfVectorizer(tokenizer=tokenize)
bag_of_words = vectorizer.fit_transform(training_data["tweets"])

In [4]:
X = bag_of_words
y = training_data["class"]
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.3, 
                                                    random_state=42)

In [5]:
models_params = [
        (
            RandomForestClassifier(random_state=1337),{
                "class_weight" : [ "balanced"], # None
                "min_samples_leaf" : [i for i in range(1, 61, 10)],
                "n_estimators" : [i for i in range(5, 20, 5)],
                "n_jobs" : [7]
            }
    ),
        (
            LinearSVC(random_state=1337),{
                "C" : [0.5, 1],
                "class_weight" : ["balanced"],
            }
    ),
        (
            LogisticRegression(random_state=1337),{
                "max_iter": [125, 150],
                "class_weight" : [None,"balanced"],
                "n_jobs" : [7],
    })
]

In [6]:
def do_gridsearch(X_train, y_train, models_params):
    results = pd.DataFrame()
    for model, param_grid in models_params:
        gs = GridSearchCV(estimator=model, 
                            error_score='raise',
                            param_grid=param_grid,
                            scoring='recall')
        gs.fit(X=X_train, y=y_train)
        results = pd.concat([results , pd.DataFrame([
    {
    'model_type' : model,
    'parameters' : params,
    'score' : score,
    }
    for params, score in zip(gs.cv_results_["params"],gs.cv_results_["mean_test_score"],
    )])])
    return results.sort_values(by='score', ascending=False)


In [7]:
tfidf_model_results = do_gridsearch(X_train, y_train, models_params)
tfidf_model_results

Unnamed: 0,model_type,parameters,score
2,RandomForestClassifier(random_state=1337),"{'class_weight': 'balanced', 'min_samples_leaf...",0.939795
1,LogisticRegression(random_state=1337),"{'class_weight': None, 'max_iter': 150, 'n_job...",0.937625
0,LogisticRegression(random_state=1337),"{'class_weight': None, 'max_iter': 125, 'n_job...",0.937625
1,RandomForestClassifier(random_state=1337),"{'class_weight': 'balanced', 'min_samples_leaf...",0.908659
0,RandomForestClassifier(random_state=1337),"{'class_weight': 'balanced', 'min_samples_leaf...",0.90208
0,LinearSVC(random_state=1337),"{'C': 0.5, 'class_weight': 'balanced'}",0.825515
1,LinearSVC(random_state=1337),"{'C': 1, 'class_weight': 'balanced'}",0.823861
2,LogisticRegression(random_state=1337),"{'class_weight': 'balanced', 'max_iter': 125, ...",0.812668
3,LogisticRegression(random_state=1337),"{'class_weight': 'balanced', 'max_iter': 150, ...",0.812668
5,RandomForestClassifier(random_state=1337),"{'class_weight': 'balanced', 'min_samples_leaf...",0.786457


{'class_weight': 'balanced',
 'min_samples_leaf': 1,
 'n_estimators': 15,
 'n_jobs': 7}

In [8]:
new_columns = pd.DataFrame(columns=["neg", "neu", "pos", "compound", 'Stopwords', 'Nouns', 'Verbs', 'Adverbs', 'Adjectives', 'Pronouns', "length"])
training_data = training_data.join(new_columns)

for index, row in training_data.iterrows():
    tweet = row[0]

    scores = SentimentIntensityAnalyzer().polarity_scores(tweet)

    for sentiment, score in scores.items():
        training_data.loc[index, sentiment] = score
        
    syntax_counts = count_syntactic_features(tweet)
    for syntax, count in syntax_counts.items():
        training_data.loc[index, syntax] = count
        
    training_data.loc[index, "length"] = len(tweet)

training_data = training_data.drop_duplicates()
training_data = training_data.reset_index(drop=True)
training_data.head()

Unnamed: 0,tweets,class,neg,neu,pos,compound,Stopwords,Nouns,Verbs,Adverbs,Adjectives,Pronouns,length
0,fav moment in sepp blatter vid ( 0:20 ) : `` w...,1,0.0,0.778,0.222,0.6908,10,5,1,1,2,0,116
1,just found this while walking my human ....,1,0.0,1.0,0.0,0.0,4,2,2,0,0,0,43
2,'disrespected the wife of prophet ' - pseudo l...,1,0.217,0.652,0.13,-0.296,3,6,2,0,0,0,80
3,do you know that super yeay satisfying feeling...,1,0.0,0.704,0.296,0.8126,11,3,6,1,1,0,120
4,if you 're going to call someone ignorant and ...,1,0.234,0.766,0.0,-0.6705,9,3,4,1,3,0,104


In [9]:
processed_columns = training_data.columns
processed_training_data = training_data.copy()
bag_of_words = vectorizer.fit_transform(training_data["tweets"])
processed_training_data.drop(columns=["tweets", "class"], inplace=True)
sparse_training_data = csr_matrix(processed_training_data.to_numpy(dtype=np.float32))

In [10]:
X = hstack([bag_of_words, sparse_training_data])
y = training_data["class"]
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.3, 
                                                    random_state=5432)
bow_features_model_results = do_gridsearch(X_train, y_train, models_params)
bow_features_model_results

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,model_type,parameters,score
2,RandomForestClassifier(random_state=1337),"{'class_weight': 'balanced', 'min_samples_leaf...",0.934639
0,LinearSVC(random_state=1337),"{'C': 0.5, 'class_weight': 'balanced'}",0.924277
1,LinearSVC(random_state=1337),"{'C': 1, 'class_weight': 'balanced'}",0.923237
1,LogisticRegression(random_state=1337),"{'class_weight': None, 'max_iter': 150, 'n_job...",0.91939
0,LogisticRegression(random_state=1337),"{'class_weight': None, 'max_iter': 125, 'n_job...",0.919286
1,RandomForestClassifier(random_state=1337),"{'class_weight': 'balanced', 'min_samples_leaf...",0.897834
0,RandomForestClassifier(random_state=1337),"{'class_weight': 'balanced', 'min_samples_leaf...",0.893225
2,LogisticRegression(random_state=1337),"{'class_weight': 'balanced', 'max_iter': 125, ...",0.746734
3,LogisticRegression(random_state=1337),"{'class_weight': 'balanced', 'max_iter': 150, ...",0.741778
5,RandomForestClassifier(random_state=1337),"{'class_weight': 'balanced', 'min_samples_leaf...",0.738728


In [11]:
bi_vectorizer = TfidfVectorizer(tokenizer=tokenize, 
                                ngram_range=(2, 2), 
                                max_features=50000)
bigram = vectorizer.fit_transform(training_data["tweets"])

In [12]:
X = hstack([bag_of_words, bigram, sparse_training_data])
y = training_data["class"]
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.3, 
                                                    random_state=99101)
bow_features_model_results = do_gridsearch(X_train, y_train, models_params)
bow_features_model_results

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,model_type,parameters,score
2,RandomForestClassifier(random_state=1337),"{'class_weight': 'balanced', 'min_samples_leaf...",0.932793
1,LogisticRegression(random_state=1337),"{'class_weight': None, 'max_iter': 150, 'n_job...",0.924142
0,LogisticRegression(random_state=1337),"{'class_weight': None, 'max_iter': 125, 'n_job...",0.919885
1,RandomForestClassifier(random_state=1337),"{'class_weight': 'balanced', 'min_samples_leaf...",0.90317
0,RandomForestClassifier(random_state=1337),"{'class_weight': 'balanced', 'min_samples_leaf...",0.898325
1,LinearSVC(random_state=1337),"{'C': 1, 'class_weight': 'balanced'}",0.823881
5,RandomForestClassifier(random_state=1337),"{'class_weight': 'balanced', 'min_samples_leaf...",0.78035
4,RandomForestClassifier(random_state=1337),"{'class_weight': 'balanced', 'min_samples_leaf...",0.775574
3,RandomForestClassifier(random_state=1337),"{'class_weight': 'balanced', 'min_samples_leaf...",0.757751
7,RandomForestClassifier(random_state=1337),"{'class_weight': 'balanced', 'min_samples_leaf...",0.751765


In [65]:
raw_test_data = pd.read_csv("data/test.csv")

test_data = preprocessing_pipeline(raw_test_data)
y_test = test_data["class"]
training_data_0 = preprocessing_pipeline(raw_training_data)
whole_dataset = pd.concat([test_data, training_data_0])

tweet_tokenizer = TweetTokenizer()
def tokenize(tweet):
    return tweet_tokenizer.tokenize(tweet)

vectorizer = TfidfVectorizer(tokenizer=tokenize)

vectorizer.fit(whole_dataset["tweets"])
X_train = vectorizer.transform(training_data_0["tweets"])
y_train = training_data_0["class"]
model = RandomForestClassifier(class_weight="balanced",
                               min_samples_leaf=1,
                               n_estimators=15,
                               n_jobs=7,
                               random_state=99101)
model.fit(X_train, y_train)
y_pred = model.predict(vectorizer.transform(test_data["tweets"]))
print(classification_report(y_pred, y_test))



              precision    recall  f1-score   support

           0       0.52      0.81      0.63      1179
           1       0.95      0.82      0.88      4896

    accuracy                           0.82      6075
   macro avg       0.73      0.81      0.75      6075
weighted avg       0.86      0.82      0.83      6075

