# Data Engineering II Project Notebook

by Mamoun LAHLOU & Grégoire NOWACKI

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold

import pickle

### Functions

In [17]:
def data_preprocessing(df):
    """Drop unused metadata columns and add a cleaned ``text_processed`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``ids``, ``date``, ``flag``, ``user`` (dropped)
        and ``text`` (cleaned). Any other columns are kept unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) without the four metadata
        columns and with an extra ``text_processed`` column holding the
        cleaned text.

    Raises
    ------
    KeyError
        If one of the columns to drop, or ``text``, is missing.
    """
    # drop() returns a new frame, so the caller's DataFrame is left untouched.
    df = df.drop(['ids', 'date', 'flag', 'user'], axis='columns')

    # regex=True must be explicit: since pandas 2.0 Series.str.replace treats
    # the pattern as a literal string by default, which would silently skip
    # every substitution below.
    cleaned = (
        df['text']
        .str.replace(r'@\S+', '', regex=True)          # strip @mentions
        .str.replace(r'http\S+', '', regex=True)       # strip URLs
        .str.replace('[^a-zA-Z]', ' ', regex=True)     # non-letters -> space
        .str.lower()
    )

    # Re-join on single spaces, discarding one-character tokens.
    df['text_processed'] = cleaned.apply(
        lambda x: ' '.join([w for w in x.split() if len(w) > 1])
    )

    return df

def model_training(df, test_size=0.33, random_state=0):
    """Fit a bag-of-words ``LinearSVC`` on ``df`` and score it on a held-out split.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text_processed`` column (the documents) and a
        ``target`` column (the labels).
    test_size : float, default 0.33
        Forwarded to ``train_test_split``.
    random_state : int, default 0
        Forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    model : LinearSVC
        Classifier fitted on the training part of the split.
    accuracy : float
        ``accuracy_score`` of ``model`` on the test part of the split.
    vec : CountVectorizer
        Vectorizer fitted on the training documents only; use it to transform
        any text passed to ``model``.
    """
    X = df['text_processed']
    y = df['target']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the vocabulary on the training documents only, then reuse it for the
    # test documents so no test-set vocabulary leaks into the features.
    vec = CountVectorizer()
    X_train_transformed = vec.fit_transform(X_train)
    X_test_transformed = vec.transform(X_test)

    model = LinearSVC()
    model.fit(X_train_transformed, y_train)

    y_pred = model.predict(X_test_transformed)
    accuracy = accuracy_score(y_test, y_pred)

    return model, accuracy, vec

In [18]:
# Load the tweets CSV; column names are supplied explicitly via `names`.
df = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding="ISO-8859-1", names=["target", "ids", "date", "flag", "user", "text"])

df = data_preprocessing(df)

model, accuracy, vec = model_training(df)

# model_training keeps its train/test split in local variables, but later cells
# read X_train, X_test, y_train, y_test, X_train_transformed and
# X_test_transformed at notebook scope. Rebuild the same split here (same
# test_size and random_state as model_training's defaults) and vectorize it
# with the already-fitted `vec`, so the notebook survives Restart & Run All.
X = df['text_processed']
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
X_train_transformed = vec.transform(X_train)
X_test_transformed = vec.transform(X_test)

In [22]:
# Persist the fitted classifier and its vectorizer. The `with` blocks close
# each file handle (and flush it to disk) as soon as the dump finishes.
with open('finalized_model.sav', 'wb') as model_file:
    pickle.dump(model, model_file)
with open('vec.sav', 'wb') as vec_file:
    pickle.dump(vec, vec_file)

In [24]:
# Reload the persisted classifier and vectorizer; the `with` blocks close the
# file handles after reading.
# NOTE: pickle.load can execute arbitrary code — only load files this notebook
# (or another trusted source) wrote.
with open('finalized_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
with open('vec.sav', 'rb') as vec_file:
    vec = pickle.load(vec_file)

# Sanity check: classify one hand-written sentence with the reloaded objects.
loaded_model.predict(vec.transform(pd.Series('I am sad but I am happy at the same time fuck good good good good')))

array([4], dtype=int64)

In [22]:
# Predict labels for the held-out test matrix.
# Requires X_test_transformed at notebook scope — the variable of the same
# name inside model_training is local to that function.
y_pred = model.predict(X_test_transformed)

In [23]:
# Share of test predictions that match the true labels.
# Requires y_test at notebook scope (model_training's y_test is local to it).
accuracy_score(y_test, y_pred)

0.7886287878787879

In [27]:
# Classify one hand-written sentence; [0] unwraps the single prediction from
# the returned array.
model.predict(vec.transform(pd.Series('I am sad but I am happy at the same time fuck good good good good')))[0]

4

In [7]:
# Search space for the randomized search over the LinearSVC `model`.
# Every key must be a LinearSVC constructor argument: the search rejects
# unknown parameter names (tree-booster options such as gamma, subsample or
# max_depth do not exist on LinearSVC).
params = {
        'C': [0.01, 0.1, 1, 10],                  # inverse regularization strength
        'loss': ['hinge', 'squared_hinge'],
        'tol': [1e-4, 1e-3],                      # stopping tolerance
        'class_weight': [None, 'balanced']
        }

In [None]:
folds = 3        # number of stratified CV folds
param_comb = 5   # number of parameter settings sampled by the search

skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001)

# Pass the splitter itself as `cv`. Pre-computing skf.split(X, y) would build
# fold indices for the full dataset, while fit() below only receives the
# training split, so the indices would not line up with the rows being fitted.
random_search = RandomizedSearchCV(
    model,
    param_distributions=params,
    n_iter=param_comb,
    scoring='roc_auc',
    n_jobs=4,
    cv=skf,
    verbose=3,
    random_state=1001,
)

random_search.fit(X_train_transformed, y_train)

In [None]:
# Show the estimator selected by the randomized search.
print(random_search.best_estimator_)

In [54]:
# Second experiment: TF-IDF features over unigrams and bigrams (ngram_range=(1, 2))
# instead of the raw counts used by model_training.
vector = TfidfVectorizer(ngram_range=(1, 2))

In [56]:
# Fit the TF-IDF vocabulary on the training documents only, then reuse it for
# the test documents.
# Requires X_train / X_test at notebook scope (model_training's are local to it).
X_training = vector.fit_transform(X_train)
X_testing = vector.transform(X_test)

In [57]:
# Rebinds `model` (until now the classifier returned by model_training) to a
# new LinearSVC trained on the TF-IDF features, so from here on `model` goes
# with `vector`, not with `vec`.
model = LinearSVC()
model.fit(X_training, y_train)

LinearSVC()

In [58]:
# Score the TF-IDF model on the held-out test features.
# Rebinds `accuracy`, which previously held the value returned by model_training.
y_prediction = model.predict(X_testing)
accuracy = accuracy_score(y_test, y_prediction)

In [59]:
# Display the test accuracy computed in the previous cell.
accuracy

0.8215321969696969

array([[204096,  59758],
       [ 51844, 212302]], dtype=int64)