In [1]:
import csv, nltk, pickle, re, time
import pandas as pd
import numpy as np
from io import StringIO
from collections import Counter
from scipy import sparse
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC

# Run nltk.download() once to fetch the NLTK data packages (corpora, tokenizers, ...) onto your computer.
# nltk.download()

In [2]:
def _read_tweet_file(path):
    """Read one tweet file (UTF-8 text) into a header-less DataFrame."""
    with open(path, "r", encoding="utf8") as handle:
        return pd.read_csv(handle, header=None)


# negative / positive training tweets, then the unlabeled test tweets
neg_DF = _read_tweet_file("twitter-datasets/train_neg_proc.txt")
pos_DF = _read_tweet_file("twitter-datasets/train_pos_proc.txt")
test_DF = _read_tweet_file("twitter-datasets/test_data_proc.txt")

In [None]:
# quick look at the first rows of the negative tweets
neg_DF.head(n=5)

<h2>TF-IDF</h2>

Create the vectorizer. We discard the words that appear in fewer than 5 tweets or in more than 80% of the tweets.

In [None]:
# Build the TF-IDF vectorizer (it is fitted later, on the labeled tweets).
vectorizer = TfidfVectorizer(
    min_df=5,           # drop words present in fewer than 5 tweets
    max_df=0.8,         # drop words present in more than 80% of the tweets
    sublinear_tf=True,  # sublinear (logarithmic) scaling of term frequencies
    use_idf=True,       # keep the inverse-document-frequency weighting on
)

We now need to create a corpus. Our training set contains both the positive and the negative tweets, and our test set is the unlabeled part.

To do this, we concatenate the negative and positive DataFrames, then create a vector of labels for them.

In [None]:
# Negatives are stacked first and positives second, so the leading rows
# get label -1 and the remaining rows get label 1.
labeled_frames = [neg_DF, pos_DF]
all_labeled_DF = pd.concat(labeled_frames, axis=0)

In [None]:
# Build the label vector: one -1.0 per negative tweet, followed by one 1.0
# per positive tweet.
negs = len(neg_DF.index)
poss = len(pos_DF.index)
labels = np.concatenate([np.full(negs, -1.0), np.full(poss, 1.0)])

In [None]:
# Hand the vectorizer the tweet-text column, not the DataFrame itself:
# iterating a DataFrame yields its column labels, so the vectorizer would
# never see the tweets.
# Assumes the tweet text sits in the first column (the files are read with
# header=None) -- TODO confirm against the preprocessed files.
# astype(str) guards against cells that read_csv parsed as NaN or as numbers.
train_texts = all_labeled_DF.iloc[:, 0].astype(str)
test_texts = test_DF.iloc[:, 0].astype(str)

# Learn vocabulary and idf weights on the labeled tweets only, then reuse
# them to encode the test tweets.
train_corpus_tf_idf = vectorizer.fit_transform(train_texts)
test_corpus_tf_idf = vectorizer.transform(test_texts)

# One tf-idf row per labeled tweet, otherwise training cannot line up.
assert train_corpus_tf_idf.shape[0] == len(labels), "tf-idf rows and labels are misaligned"

In [None]:
# create both models
# LinearSVC draws on a pseudo-random generator when its dual solver shuffles
# the data; pinning the seed makes the fit reproducible across runs.
model1 = LinearSVC(random_state=42)  # linear SVM
model2 = MultinomialNB()             # multinomial naive Bayes

In [None]:
# Fit each classifier on the same tf-idf matrix and label vector
# (SVM first, naive Bayes second).
model1.fit(X=train_corpus_tf_idf, y=labels)
model2.fit(X=train_corpus_tf_idf, y=labels)

In [None]:
# Predict the test-tweet labels with each fitted model, in model order.
result1, result2 = (
    clf.predict(test_corpus_tf_idf) for clf in (model1, model2)
)

`result1` and `result2` are the labels predicted for the tweets in the test corpus. We probably just have to transform them into a CSV file, as shown in the sample submission.

In [None]:
# The submission CSV expects integer labels, so cast every prediction to int.
result1 = list(map(int, result1))
result2 = list(map(int, result2))

In [None]:
# Submission table for the SVM predictions: a 1-based Id next to each
# predicted label, written without the DataFrame index.
svm_df = (
    pd.DataFrame(result1)
    .rename(columns={0: 'Prediction'})
    .assign(Id=lambda frame: frame.index + 1)
    .loc[:, ['Id', 'Prediction']]
)
svm_df.to_csv('svm.csv', index=False)

In [None]:
# Submission table for the naive Bayes predictions: a 1-based Id next to
# each predicted label, written without the DataFrame index.
bayes_df = (
    pd.DataFrame(result2)
    .rename(columns={0: 'Prediction'})
    .assign(Id=lambda frame: frame.index + 1)
    .loc[:, ['Id', 'Prediction']]
)
bayes_df.to_csv('bayes.csv', index=False)