In [51]:
import pandas as pd 
import re
import string
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from prettytable import PrettyTable

train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

In [52]:
#Cleanup the strings so we only get important words

#Remove any urls
def remove_url(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

#Remove any extra html
def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)

#Remove any emojis
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

#Remove puncations and hashtags
def remove_punct(text):
    table=str.maketrans('','',string.punctuation)
    return text.translate(table)

def clean_tweet(text):
    text = remove_url(text)
    text = remove_html(text)
    text = remove_emoji(text)
    text = remove_punct(text)
    return text

#Only apply this to train_df
train_df['text'] = train_df['text'].apply(lambda x : clean_tweet(x))

In [53]:
# Split the data into train and validation sets
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Convert the text into vectors
vectorizer = CountVectorizer()

x_train, x_test, y_train, y_test = train_test_split(train_df['text'], train_df['target'], test_size=0.3, random_state=42)

#After splitting into train and test, tokenize
x_train_vectors = vectorizer.fit_transform(x_train)
x_test_vectors = vectorizer.transform(x_test)

In [54]:
clf_KNN = KNeighborsClassifier(n_neighbors=5)
clf_DTC = DecisionTreeClassifier()
clf_LOG = LogisticRegression()
clf_MNB = MultinomialNB()
clf_BNB = BernoulliNB()
clf_SGD = SGDClassifier()
clf_RFC = RandomForestClassifier()
clf_GBC = GradientBoostingClassifier()

classifiers = {
    "knn_classifier": clf_KNN,
    "decision_tree_classifier": clf_DTC,
    "logistic_regression": clf_LOG,
    "multinomial_nb_classifier": clf_MNB,
    "bernoulli_nb_classifier": clf_BNB,
    "sgd_classifier": clf_SGD,
    "random_forest_classifier": clf_RFC,
    "gradient_boosting_classifier": clf_GBC
}

table = [["classifier", "train_test_split accuracy score"]]

for name, clf in classifiers.items(): 
    clf.fit(x_train_vectors, y_train)
    y_predicted = clf.predict(x_test_vectors)
    accuracy = accuracy_score(y_test, y_predicted)
    table.append([name, accuracy])
    
tab = PrettyTable(table[0])
tab.add_rows(table[1:])
print(tab)

+------------------------------+---------------------------------+
|          classifier          | train_test_split accuracy score |
+------------------------------+---------------------------------+
|        knn_classifier        |        0.6711908931698775       |
|   decision_tree_classifier   |        0.7329246935201401       |
|     logistic_regression      |        0.8064798598949212       |
|  multinomial_nb_classifier   |        0.8056042031523643       |
|   bernoulli_nb_classifier    |        0.8112959719789843       |
|        sgd_classifier        |        0.7837127845884413       |
|   random_forest_classifier   |        0.7915936952714536       |
| gradient_boosting_classifier |        0.7324868651488616       |
+------------------------------+---------------------------------+


In [55]:
# retrain with full training dataset

def generate_submission_file(clf_name, clf):
    filename = "submissions/" + clf_name + "_submission.csv"
    f = open(filename, "w")
    f.write("id,target\n")
    for i in range (len(y_predicted)):
        index = test_df['id'][i]
        f.write(str(index) + "," + str(y_predicted[i]) + "\n")
    f.close()

x_train = train_df['text'].apply(lambda x : clean_tweet(x))
y_train = train_df['target']
x_test = test_df['text'].apply(lambda x : clean_tweet(x))

vectorizer = CountVectorizer()
x_train_vectors = vectorizer.fit_transform(x_train)
x_test_vectors = vectorizer.transform(x_test)

# print(x_train_vectors)
clf_KNN = KNeighborsClassifier(n_neighbors=5)
clf_DTC = DecisionTreeClassifier()
clf_LOG = LogisticRegression(max_iter=400)
clf_MNB = MultinomialNB()
clf_BNB = BernoulliNB()
clf_SGD = SGDClassifier()
clf_RFC = RandomForestClassifier()
clf_GBC = GradientBoostingClassifier()

classifiers = {
    "knn_classifier": clf_KNN,
    "decision_tree_classifier": clf_DTC,
    "logistic_regression": clf_LOG,
    "multinomial_nb_classifier": clf_MNB,
    "bernoulli_nb_classifier": clf_BNB,
    "sgd_classifier": clf_SGD,
    "random_forest_classifier": clf_RFC,
    "gradient_boosting_classifier": clf_GBC
}

for name, clf in classifiers.items(): 
    clf.fit(x_train_vectors, y_train)
    y_predicted = clf.predict(x_test_vectors)
    generate_submission_file(name, clf)