CURRENT STANDINGS : 330/860 !!!!

**KAGGLE COMPETITION : MACHINE LEARNING APPROACH**

**A. Preprocessing**
1. Import libraries
2. Import dataset
3. Basic data cleaning : get rid of null values, punctuation and html tags
4. NLP data preprocessing : remove stop words, stem/lemmatize words (+ remove repeating words)
5. Test-Train split
6. Vectorize data (fit the vectorizer on the train data, then transform both train and test data) with CountVectorizer or TfidfVectorizer
7. Oversample data (deal with imbalanced data with SMOTE)

**B. Models**
Benchmark ML models (LogisticRegression, LinearSVC, Multinomial Naive Bayes, Random forest classifier...)

**C. Make prediction** 

Submit predictions and put my name on the leaderboard !!

In [41]:
import pandas as pd
import numpy as np
import regex as re
import os
import nltk
from imblearn.over_sampling import SMOTE
from nltk.stem.porter import PorterStemmer
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS

# Punctuation characters that are deleted outright (raw string: no invalid escapes).
_REPLACE_NO_SPACE = re.compile(r"[.;:!'?,\"()\[\]]")
# HTML line breaks (doubled or single), hyphens and slashes: each match becomes one space.
# The doubled form is listed first so that "<br /><br />" still collapses to a single space.
_REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(<br\s*/?>)|(-)|(/)")


def clean_reviews(review):
    """Lower-case a text and strip punctuation and HTML line breaks.

    Args:
        review: The raw text (a ``str``).

    Returns:
        The lower-cased text in which ``. ; : ! ' ? , " ( ) [ ]`` are removed
        and ``<br>`` tags, ``-`` and ``/`` are each replaced by a space.
    """
    # Punctuation is removed first, so "/" is still present when <br /> tags are matched.
    review = _REPLACE_NO_SPACE.sub("", review.lower())
    return _REPLACE_WITH_SPACE.sub(" ", review)

# def remove_repeating_words(review):
#     review1 = yield(gensim.utils.simple_preprocess(str(review),deacc=True))
#     return review1

def remove_stop_words(review):
    """Return ``review`` without the tokens listed in ``ENGLISH_STOP_WORDS``.

    The text is split on single spaces, tokens found in the stop-word set are
    discarded, and the remaining tokens are joined back with single spaces.
    """
    kept_tokens = []
    for token in review.split(" "):
        if token in ENGLISH_STOP_WORDS:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

def stem_reviews(review):
    """Apply Porter stemming to every space-separated token of ``review``.

    NOTE: a new ``PorterStemmer`` is created on each call, i.e. once per row
    when this function is used through ``Series.apply``.
    """
    porter = PorterStemmer()
    stemmed_tokens = map(porter.stem, review.split(" "))
    # Re-assemble the stemmed tokens into one space-separated string.
    return " ".join(stemmed_tokens)

def lemmatize_reviews(review):
    """Lemmatize every space-separated token of ``review`` with WordNet.

    A new ``WordNetLemmatizer`` is created on each call; the lemmatized tokens
    are joined back into a single space-separated string.
    """
    wordnet = WordNetLemmatizer()
    lemmas = []
    for token in review.split(" "):
        lemmas.append(wordnet.lemmatize(token))
    return " ".join(lemmas)

def oversample(data_list):
    """Balance the training split with SMOTE and return the four-element data list.

    Only the training data is resampled. The evaluation split is returned
    untouched: oversampling it would mix synthetic samples into the data used
    to score the models and distort the reported metrics.

    Args:
        data_list: ``[X_train, y_train, X_eval, y_eval]``. ``y_train`` must
            support ``value_counts()`` after resampling (e.g. a pandas Series).

    Returns:
        ``[X_train_sm, y_train_sm, X_eval, y_eval]`` where the first two items
        are the SMOTE-resampled training features and labels.

    Raises:
        AssertionError: If the resampled training labels are not balanced.
    """
    X_train, y_train, X_eval, y_eval = data_list
    X_train_sm, y_train_sm = SMOTE().fit_resample(X_train, y_train)
    # Sanity check: after SMOTE every class must have the same number of samples.
    assert len(y_train_sm.value_counts().unique()) == 1
    return [X_train_sm, y_train_sm, X_eval, y_eval]

class Preprocessor:
    """Clean, split and vectorize a DataFrame whose texts live in a ``Reviews`` column.

    In training mode (``train=True``) a new vectorizer is created from
    ``vec_name``; otherwise the already fitted vectorizer passed as ``vec`` is
    reused. Labels, when needed, are read from a ``Sentiments`` column.
    """

    def __init__(self, df, train = True, vec = None, nlp_method = "lemmatize", vec_name = "tfidf", stop_words = "english", ngram_range = (1,2)) -> None:
        """Store the configuration and build (or adopt) the vectorizer.

        Args:
            df: DataFrame with a ``Reviews`` column. It is modified in place
                by the cleaning methods.
            train: ``True`` to create a new vectorizer, ``False`` to reuse ``vec``.
            vec: A fitted vectorizer; required when ``train`` is false and
                must be ``None`` when ``train`` is true.
            nlp_method: One of ``"None"`` (the string), ``"stem"``, ``"lemmatize"``.
            vec_name: One of ``"binary"``, ``"count"``, ``"tfidf"``.
            stop_words: Stored on the instance only; it is not passed to the
                vectorizer (stop words are removed in ``nlp_clean``).
            ngram_range: n-gram range given to the vectorizer.

        Raises:
            AssertionError: If the arguments are inconsistent or not in the
                allowed sets.
        """
        assert (train and vec is None) or (not train and vec is not None), \
            "pass vec=None when train is True, and a fitted vec when train is False"
        assert nlp_method in ["None", "stem", "lemmatize"]
        assert vec_name in ["binary", "count", "tfidf"]
        self.df = df
        self.train = train
        self.nlp_method = nlp_method
        self.vec_name = vec_name
        self.stop_words = stop_words
        self.ngram_range = ngram_range
        if train:
            if vec_name == "tfidf":
                self.vec = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=ngram_range)
            else:
                # "binary" and "count" share CountVectorizer; only the binary flag differs.
                self.vec = CountVectorizer(binary=vec_name == "binary", ngram_range=ngram_range)
        else:
            self.vec = vec
        # Filled by train_eval_split(); test_df is kept for backward compatibility.
        self.train_df = self.eval_df = self.test_df = None

    def basic_clean(self) -> None:
        """Drop rows containing nulls and normalise the ``Reviews`` text in place."""
        print("   Dropping null values")
        self.df.dropna(inplace = True)
        print("   Getting rid of punctuation and HTML tags")
        self.df["Reviews"] = self.df["Reviews"].apply(clean_reviews)
        # NOTE: the message below is printed but no de-duplication step is performed here.
        print("   Removing repeating words")

    def nlp_clean(self) -> None:
        """Remove stop words, then stem or lemmatize according to ``nlp_method``."""
        print("   Removing stop words")
        self.df["Reviews"] = self.df["Reviews"].apply(remove_stop_words)
        if self.nlp_method == "stem":
            print("   Stemming reviews")
            self.df["Reviews"] = self.df["Reviews"].apply(stem_reviews)
        elif self.nlp_method == "lemmatize":
            print("   Lemmatizing reviews")
            self.df["Reviews"] = self.df["Reviews"].apply(lemmatize_reviews)
        else:
            print("   No NLP cleaning")

    def train_eval_split(self, train_size = 0.8, random_state = 42) -> None:
        """Split ``self.df`` into disjoint ``train_df`` and ``eval_df``.

        ``int(train_size * number_of_rows)`` rows are sampled for training with
        the given ``random_state``; all remaining rows form the evaluation set.
        Both frames are re-indexed from 0.
        """
        n_train = int(train_size * self.df.shape[0])
        # Keep the original index while sampling so the complement can be
        # identified; resetting it first would make the eval set overlap
        # with the training sample.
        sampled = self.df.sample(n=n_train, axis=0, random_state=random_state)
        held_out = self.df[~self.df.index.isin(sampled.index)]
        self.train_df = sampled.reset_index(drop=True)
        self.eval_df = held_out.reset_index(drop=True)

    def vectorize_train(self):
        """Fit the vectorizer on the training reviews and transform both splits.

        Returns:
            ``(vec, X_train, X_eval)``.
        """
        print(f"   Vecorizing training data : {self.vec_name}")
        vec = self.vec
        # Fit on the training split only, then apply the fitted vocabulary to eval.
        X_train = vec.fit_transform(self.train_df["Reviews"])
        X_eval = vec.transform(self.eval_df["Reviews"])
        return vec, X_train, X_eval

    def final_split(self):
        """Return the fitted vectorizer and ``[X_train, y_train, X_eval, y_eval]``.

        Requires ``train_eval_split()`` to have been called first.
        """
        y_train = self.train_df["Sentiments"]
        y_eval = self.eval_df["Sentiments"]
        vec, X_train, X_eval = self.vectorize_train()
        return vec, [X_train, y_train, X_eval, y_eval]

    def vectorize_test(self):
        """Transform ``self.df["Reviews"]`` with the current vectorizer (no fitting).

        Returns:
            ``(vec, vectorized_reviews)``.
        """
        print(f"   Vecorizing test data : {self.vec_name}")
        vec = self.vec
        vec_reviews = vec.transform(self.df["Reviews"])
        return vec, vec_reviews

    def vectorize_full_train(self):
        """Fit the vectorizer on every row of ``self.df`` and transform it.

        Returns:
            ``(vec, [X_train_full, y_train_full])`` where the labels come from
            the ``Sentiments`` column.
        """
        print(f"   Vecorizing full training data : {self.vec_name}")
        vec = self.vec
        X_train_full = vec.fit_transform(self.df["Reviews"])
        y_train_full = self.df["Sentiments"]
        return vec, [X_train_full, y_train_full]

def data_preprocessor_train(filename, separator, header, use_cols, col_dict, smote = True):
    """Load a labelled CSV and produce vectorized train/eval splits for benchmarking.

    The file is read, its columns renamed with ``col_dict``, cleaned, split
    into train and eval parts and vectorized. When ``smote`` is true the
    resulting data list is passed through ``oversample``.

    Returns:
        ``(vec, [X_train, y_train, X_eval, y_eval])``.
    """
    print("Reading input data")
    raw_df = pd.read_csv(filename, sep=separator, usecols=use_cols, header=header)
    raw_df.rename(columns=col_dict, inplace=True)
    print(raw_df.shape)
    print(raw_df.head())

    pipeline = Preprocessor(df=raw_df)
    print("Basic cleaning...")
    pipeline.basic_clean()
    print("NLP cleaning...")
    pipeline.nlp_clean()
    print("Train-Eval split...")
    pipeline.train_eval_split()
    fitted_vec, splits = pipeline.final_split()

    if not smote:
        return fitted_vec, splits
    print("Balancing dataset...")
    return fitted_vec, oversample(splits)

def data_preprocessor_test(filename, separator, header, use_cols, col_dict, vec):
    """Load a CSV, clean it and transform it with an already fitted vectorizer.

    ``vec`` is handed to ``Preprocessor`` in non-training mode, so it is only
    used to transform the cleaned ``Reviews`` column, never refitted.

    Returns:
        ``(vec, vectorized_reviews)``.
    """
    print("Reading input data")
    raw_df = pd.read_csv(filename, sep=separator, header=header, usecols=use_cols)
    raw_df.rename(columns=col_dict, inplace=True)
    print(raw_df.shape)
    print(raw_df.head())

    pipeline = Preprocessor(df=raw_df, train=False, vec=vec)
    print("Basic cleaning...")
    pipeline.basic_clean()
    print("NLP cleaning...")
    pipeline.nlp_clean()

    fitted_vec, vectorized_reviews = pipeline.vectorize_test()
    return fitted_vec, vectorized_reviews

def data_preprocessor_full_train(filename, separator, header, use_cols, col_dict, smote = True):
    """Load a labelled CSV and vectorize all of it, without a train/eval split.

    The vectorizer is fitted on every cleaned row so that the final model can
    be trained on the whole file. ``smote`` is accepted but currently unused:
    no oversampling is applied in this function.

    Returns:
        ``(vec, [X_train_full, y_train_full])``.
    """
    print("Reading input data")
    raw_df = pd.read_csv(filename, sep=separator, usecols=use_cols, header=header)
    raw_df.rename(columns=col_dict, inplace=True)
    print(raw_df.shape)
    print(raw_df.head())

    pipeline = Preprocessor(df=raw_df)
    print("Basic cleaning...")
    pipeline.basic_clean()
    print("NLP cleaning...")
    pipeline.nlp_clean()

    fitted_vec, full_data = pipeline.vectorize_full_train()
    return fitted_vec, full_data

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\adrie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [42]:
# print("******************* PREPROCESSING TRAIN DATA *******************")

# train_vec, train_data_list = data_preprocessor_train("data/train.csv",",", 0, [3, 4], {"text":"Reviews","target":"Sentiments"}, False)
# # print(train_data_list)
# # print(train_vec)
# # print(len(train_data_list))
# for i in range(4):
#     print(train_data_list[i].shape)

print("******************* PREPROCESSING FULL TRAIN DATA *******************")

# Vectorize the complete training file; the vectorizer fitted here is reused
# for the test file below and the data feeds the final model fit.
full_train_vec, full_train_data_list = data_preprocessor_full_train(
    filename="data/train.csv",
    separator=",",
    header=0,
    use_cols=[3, 4],
    col_dict={"text": "Reviews", "target": "Sentiments"},
    smote=False,
)
# Show the shapes of the feature matrix and of the label vector.
for part in full_train_data_list:
    print(part.shape)

print("******************* PREPROCESSING TEST DATA *******************")
# Only the text column is loaded; it is transformed with the vectorizer
# fitted on the full training data.
test_vec, test_data = data_preprocessor_test(
    filename="data/test.csv",
    separator=',',
    header=0,
    use_cols=[3],
    col_dict={"text": "Reviews"},
    vec=full_train_vec,
)
print(test_data.shape)
# #print(type(test_data))

******************* PREPROCESSING FULL TRAIN DATA *******************
Reading input data
(7613, 2)
                                             Reviews  Sentiments
0  Our Deeds are the Reason of this #earthquake M...           1
1             Forest fire near La Ronge Sask. Canada           1
2  All residents asked to 'shelter in place' are ...           1
3  13,000 people receive #wildfires evacuation or...           1
4  Just got sent this photo from Ruby #Alaska as ...           1
Basic cleaning...
   Dropping null values
   Getting rid of punctuation and HTML tags
   Removing repeating words
NLP cleaning...
   Removing stop words
   Lemmatizing reviews
   Vecorizing full training data : tfidf
(7613, 3576)
(7613,)
******************* PREPROCESSING TEST DATA *******************
Reading input data
(3263, 1)
                                             Reviews
0                 Just happened a terrible car crash
1  Heard about #earthquake is different cities, s...
2  there is a forest 

In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

# Estimators to benchmark, keyed by a short name that is also used in the
# submission file names.
model_dict = {"svc" : SVC(max_iter=1000, random_state=0),
            # n_jobs is not set here: it has no effect with the 'liblinear'
            # solver and only triggers a warning on every fit.
            "lr" : LogisticRegression(random_state=0, solver='liblinear', max_iter=10000),
            "knc" : KNeighborsClassifier(),
            "random_forest":RandomForestClassifier(random_state=0, n_jobs=-1),
            "naive_bayes" : MultinomialNB()}

# Hyper-parameter grids explored by GridSearchCV; keys must match model_dict.
params_dict = {"svc" : {'C':[0.01,0.1,1,2,5,10,25], 'kernel':['linear', 'rbf', 'poly']},
            "lr" : {'C':[0.01,0.1,1,2,5,10,25], 'penalty':['l1','l2']},
            "knc" : {'n_neighbors':[3,5,10], 'weights':['uniform','distance']},
            "random_forest":{'n_estimators':[100,500,1000], 'max_depth':[2,3,5]},
            "naive_bayes" : {'alpha':[1,2]}}

# from sklearn.metrics import f1_score, make_scorer
# f1 = make_scorer(f1_score , average='macro')

# def benchmark(train_data_list, test_data):
#     X_train = train_data_list[0]
#     y_train = train_data_list[1]
#     X_eval = train_data_list[2]
#     y_eval = train_data_list[3]
#     X_test = test_data
#     for model_item in model_dict.items():
#         model_name = model_item[0]
#         print(f"***************___{model_name}___***************")
#         model = model_item[1]
#         params = params_dict[model_name]
#         grid_clf_acc = GridSearchCV(model, param_grid = params,scoring = 'f1')
#         grid_clf_acc.fit(X_train, y_train)
#         print(grid_clf_acc.best_params_)
#         y_eval_pred = grid_clf_acc.predict(X_eval)
#         f1 = f1_score(y_eval,y_eval_pred)
#         print(f1)
#         print(classification_report(y_eval,y_eval_pred))
#         print(confusion_matrix(y_eval,y_eval_pred))
#         y_pred = grid_clf_acc.predict(X_test)
#         submission = pd.read_csv("data/test.csv",usecols=[0])
#         submission["target"] = y_pred  #pd.DataFrame({"id":[i for i in range(156061,222353)],"target":y_pred})
#         submission.to_csv(f"Submission_{model_name}.csv", index=False)

def benchmark_full_train(full_train_data_list, test_data, test_csv_path="data/test.csv"):
    """Grid-search every model on the full training data and write one submission each.

    For each entry of the module-level ``model_dict`` a ``GridSearchCV``
    (scoring ``'f1'``) is fitted with the matching grid from ``params_dict``,
    the best parameters are printed, and the predictions for ``test_data`` are
    saved to ``Submission_<model_name>.csv`` in the working directory.

    Args:
        full_train_data_list: ``[X_train, y_train]``.
        test_data: Vectorized test features, row-aligned with the test CSV.
        test_csv_path: CSV whose first column is copied into every submission
            file. Defaults to the path that was previously hard-coded.
    """
    X_train, y_train = full_train_data_list[0], full_train_data_list[1]
    # The first column of the test file is identical for every model: read it once.
    test_ids = pd.read_csv(test_csv_path, usecols=[0])
    for model_name, model in model_dict.items():
        print(f"***************___{model_name}___***************")
        grid_search = GridSearchCV(model, param_grid=params_dict[model_name], scoring='f1')
        grid_search.fit(X_train, y_train)
        print(grid_search.best_params_)
        # Copy so each model's submission starts from the untouched id column.
        submission = test_ids.copy()
        submission["target"] = grid_search.predict(test_data)
        submission.to_csv(f"Submission_{model_name}.csv", index=False)


# Run the benchmark on the full training data and write one submission file per model.
benchmark_full_train(
    full_train_data_list=full_train_data_list,
    test_data=test_data,
)

***************___svc___***************




{'C': 0.01, 'kernel': 'linear'}
***************___lr___***************


  " = {}.".format(effective_n_jobs(self.n_jobs)))


{'C': 5, 'penalty': 'l2'}
***************___knc___***************
{'n_neighbors': 5, 'weights': 'distance'}
***************___random_forest___***************
{'max_depth': 5, 'n_estimators': 100}
***************___naive_bayes___***************
{'alpha': 1}


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


In [None]:
# model = LogisticRegression(random_state=0,multi_class="multinomial", solver='saga', C= 2, n_jobs = -1, max_iter=1000)
# X_train_full = full_train_data_list[0]
# y_train_full = full_train_data_list[1]
# model.fit(X_train_full,y_train_full)
# y_pred = model.predict(test_data)
# submission = pd.DataFrame({"PhraseId":[i for i in range(156061,222353)],"Sentiment":y_pred})
# submission.to_csv(f"Submission_lr_full_train.csv", index=False)