### Naive Bayes Model

In [101]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import re

In [102]:
df = pd.read_csv("../data/raw/train.csv")

In [103]:
#Replace multiple space with one space
def preprocess(string):
    tweet = string.lower()
    #Remove links
    tweet = re.sub(r"http\S+", "", tweet)
    #Remove pics
    tweet = re.sub(r"pic.twitter\S+", "", tweet)
    #Remove @
    tweet = re.sub(r"@\S+", "", tweet)
    #Remove #
    tweet = re.sub(r"#\S+", "", tweet)
    #Replace new line or tab with space
    tweet = re.sub(r"[\n\t]+", " ", tweet)
    #Only keeping words
    tweet = re.sub(r"[^a-zA-Z ]+", "", tweet)
    #Removing multiple spaces
    tweet = re.sub(r"[\s\s]+", " ", tweet)
    #Remove leading spaces
    tweet=tweet.strip()

    return tweet

In [104]:
def transform_train_data(df, tweet_column_name, label_column_name):
    data = df.copy()
    data["label"] = data[label_column_name].replace("Quality", 0).replace("Spam", 1)
    data["Process_tweet"] = data[tweet_column_name].apply(preprocess)
    return data[["Process_tweet", "label"]]

In [105]:
pre_preprocessing = df[["Tweet", "Type"]].head()

In [106]:
pre_preprocessing.to_csv("pre_preprocess.csv", index="False")

In [107]:
model_df.head().to_csv("preprocess.csv", index="False")

In [108]:
model_df = transform_train_data(df, "Tweet", "Type")

In [109]:
X_train, X_test, Y_train, Y_test = train_test_split(model_df['Process_tweet'], 
                                                    model_df['label'], 
                                                    random_state=1)

In [110]:
vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 1)).fit(X_train)
x_train_vectorized = vectorizer.transform(X_train)

In [111]:
x_train_vectorized.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [112]:
from sklearn.model_selection import GridSearchCV

In [113]:
parameters = {"alpha":[.01, .1, .5 , 1, 2]}
grid_search = GridSearchCV(MultinomialNB(), parameters, cv=5)
results = grid_search.fit(x_train_vectorized, Y_train)

In [114]:
results.best_params_

{'alpha': 0.5}

In [115]:
model = MultinomialNB(alpha=.5) #Multinominal looks at the occurence count
model.fit(x_train_vectorized, Y_train)

MultinomialNB(alpha=0.5)

In [116]:
from sklearn.metrics import confusion_matrix

In [117]:
predictions = model.predict(vectorizer.transform(X_test))

In [118]:
accuracy = 100*sum(predictions==Y_test)/len(predictions)
accuracy

79.47860962566845

In [119]:
cm = confusion_matrix(Y_test, predictions, labels=[0, 1])

In [120]:
pd.DataFrame(data=cm).to_csv("confusion_matrix.csv")

### Testing saving model

In [136]:
import joblib

In [137]:
filename1="final_naivebayes.sav"
joblib.dump(model,filename)

['vectorizer.sav']

In [138]:
filename2="vectorizer.sav"
joblib.dump(vectorizer,filename)

['vectorizer.sav']

In [139]:
test = joblib.load(filename1)

In [140]:
vector = joblib.load(filename2)

In [141]:
predict = test.predict(vector.transform(X_test))

In [142]:
accuracy = 100*sum(predict==Y_test)/len(predict)

In [143]:
accuracy

79.47860962566845

In [144]:
vector.transform("JO, my")

ValueError: Iterable over raw text documents expected, string object received.

In [146]:
test.predict(vector.transform(pd.Series(["Hi, my name is"])))[0]

0