In [1]:
import pandas as pd
import numpy as np 
import re
from nltk.tokenize import word_tokenize, WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from keras.preprocessing.text import Tokenizer
from sklearn.linear_model import LogisticRegression
from gensim.models import Word2Vec 
import multiprocessing
from sklearn.model_selection import train_test_split
# SVM imports
from sklearn import svm
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import csv

# K-NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier

# NN imports
from sklearn.neural_network import MLPClassifier

In [2]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\maity\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\maity\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## Load the dataset as pandas

In [4]:
df_train_pos = pd.read_csv("twitter-datasets/train_pos.txt",sep="\t",names=['tweets'])
df_train_pos['label'] = 1
df_train_neg = pd.read_csv("twitter-datasets/train_neg.txt",sep="\t",names=['tweets'])
df_train_neg['label'] = -1
df_train = pd.concat([df_train_pos, df_train_neg])
print(df_train_pos.shape)
print(df_train_neg.shape)
print(df_train.shape)

(97902, 2)
(99068, 2)
(196970, 2)


In [5]:
df_train.head()

Unnamed: 0,tweets,label
0,<user> i dunno justin read my mention or not ....,1
1,"because your logic is so dumb , i won't even c...",1
2,<user> just put casper in a box ! looved the...,1
3,<user> <user> thanks sir > > don't trip lil ma...,1
4,visiting my brother tmr is the bestest birthda...,1


## Remove tags and urls

In [6]:
def remove_tags(df):
    df_cleaned = df.copy()
    df_cleaned['tweets'] = df_cleaned['tweets'].apply(lambda tweet: re.sub(r'<.*?>', '', tweet).strip())
    return df_cleaned

In [7]:
remove_tags(df_train)

Unnamed: 0,tweets,label
0,i dunno justin read my mention or not . only j...,1
1,"because your logic is so dumb , i won't even c...",1
2,just put casper in a box ! looved the battle ...,1
3,thanks sir > > don't trip lil mama ... just ke...,1
4,visiting my brother tmr is the bestest birthda...,1
...,...,...
99063,can't wait to fake tan tonight ! hate being pale,-1
99064,darling i lost my internet connection .. and i...,-1
99065,kanguru defender basic 4 gb usb 2.0 flash driv...,-1
99066,rizan is sad now,-1


## Removing stop words + stemming + lemmatization

In [8]:
def tokenize_and_preprocess(df, stop_words = False, stemming = False, lemmatization = False):
    df_cleaned = df.copy()
    df_cleaned['tokens'] = df_cleaned['tweets'].apply(lambda tweet: word_tokenize(tweet))
    # remove stop words
    if stop_words:
        stop_words = stopwords.words('english')
        df_cleaned['tokens'] = df_cleaned['tokens'].apply(lambda tokens: [token for token in tokens if token.lower() not in stop_words])
    # stemming
    if stemming:
        ps = PorterStemmer()
        df_cleaned['tokens'] = df_cleaned['tokens'].apply(lambda tokens: [ps.stem(token) for token in tokens])
    # lemmatization
    if lemmatization:
        wordnet_lemmatizer = WordNetLemmatizer()
        df_cleaned['tokens'] = df_cleaned['tokens'].apply(lambda tokens: [wordnet_lemmatizer.lemmatize(token) for token in tokens])
    # remove the tweets columns
    df_cleaned.drop(['tweets'], axis=1, inplace=True)
    df_cleaned = df_cleaned.reindex(columns=['tokens', 'label'])
    return df_cleaned

In [9]:
tokens = list(tokenize_and_preprocess(remove_tags(df_train), stop_words = False, stemming = False, lemmatization = False).tokens)

#### Load the World2Vec model

In [10]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer
cores

8

In [11]:
def construct_word2vec_model(tokens, size, model_name):
    model = Word2Vec(tokens, min_count = 5, window = 10, vector_size = size, workers=cores-1)
    model.save('word2vec.model')
    
construct_word2vec_model(tokens, 100, 'word2vec.model')

In [12]:
model = Word2Vec.load('word2vec.model')

In [13]:
word_vectors = model.wv
print(word_vectors)

KeyedVectors<vector_size=100, 20753 keys>


In [14]:
def construct_vector(tokens):
    embeddings = []
    for token in tokens:
        if token in model.wv:
            embeddings.append(model.wv[token])
    if len(embeddings) == 0:
        return np.zeros(100)
    return np.average (embeddings,axis=0)

## Test with word2vec

In [16]:
df_cleaned_tweets = remove_tags(df_train)
df_cleaned_tweets = tokenize_and_preprocess(df_cleaned_tweets)
df_cleaned_tweets.head()

Unnamed: 0,tokens,label
0,"[i, dunno, justin, read, my, mention, or, not,...",1
1,"[because, your, logic, is, so, dumb, ,, i, wo,...",1
2,"[just, put, casper, in, a, box, !, looved, the...",1
3,"[thanks, sir, >, >, do, n't, trip, lil, mama, ...",1
4,"[visiting, my, brother, tmr, is, the, bestest,...",1


## Constructing the vectors embeddings

In [17]:
df_cleaned_tweets['vectors'] = df_cleaned_tweets['tokens'].apply(lambda token: construct_vector(token))

In [18]:
df_cleaned_tweets

Unnamed: 0,tokens,label,vectors
0,"[i, dunno, justin, read, my, mention, or, not,...",1,"[0.02181764, 0.28991354, -0.6262814, 0.6142028..."
1,"[because, your, logic, is, so, dumb, ,, i, wo,...",1,"[0.25842845, -0.14536653, -0.8079718, 0.480807..."
2,"[just, put, casper, in, a, box, !, looved, the...",1,"[-0.48266414, 0.301264, 0.28670168, 0.52260065..."
3,"[thanks, sir, >, >, do, n't, trip, lil, mama, ...",1,"[-0.40605763, -0.5459495, 0.13085735, 0.183085..."
4,"[visiting, my, brother, tmr, is, the, bestest,...",1,"[-0.24251671, 0.40992773, -0.4513451, 0.080455..."
...,...,...,...
99063,"[ca, n't, wait, to, fake, tan, tonight, !, hat...",-1,"[-0.6172069, -0.078211434, -0.08028211, 0.5522..."
99064,"[darling, i, lost, my, internet, connection, ....",-1,"[-0.48892963, 0.26758033, -0.39285302, 0.94380..."
99065,"[kanguru, defender, basic, 4, gb, usb, 2.0, fl...",-1,"[-0.16174838, 0.5098958, 0.13095143, -0.360084..."
99066,"[rizan, is, sad, now]",-1,"[-0.32928637, 0.15658653, -0.9092396, 1.145558..."


## Test models

In [19]:
def train_test_model(model_to_use,x_train,y_train,x_test):
    
    if model_to_use == "LogisticRegression":
        logisticRegr = LogisticRegression(max_iter=4000)
        logisticRegr.fit(x_train, y_train) 
        return logisticRegr.predict(x_test)
    
    if model_to_use == "SVM":
        # Use linear SVM for optimization (text classification is often linear)
        lin_clf = svm.LinearSVC()
        lin_clf.fit(x_train,y_train)
        return lin_clf.predict(x_test)
    
    if model_to_use == "NeuralNetwork":
        clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1,max_iter=4000)
        clf.fit(x_train, y_train)
        return clf.predict(x_test)

In [20]:
x_train = np.array(df_cleaned_tweets["vectors"].tolist())
y_train =  df_train["label"].tolist()

In [21]:
tweets = []
ids = []
with open("twitter-datasets/test_data.txt") as f:
    for line in f:
        id, tweet = line.split(',', 1)

        tweets.append(tweet)
        ids.append(id)

    df_test = pd.DataFrame(list(zip(ids, tweets)), columns=['id', 'tweets'])

In [22]:
df_test = tokenize_and_preprocess(remove_tags(df_test))

In [23]:
df_test.head()

Unnamed: 0,tokens,label
0,"[sea, doo, pro, sea, scooter, (, sports, with,...",
1,"[shucks, well, i, work, all, week, so, now, i,...",
2,"[i, cant, stay, away, from, bug, thats, my, baby]",
3,"[no, ma'am, !, !, !, lol, im, perfectly, fine,...",
4,"[whenever, i, fall, asleep, watching, the, tv,...",


In [24]:
df_test["vectors"] = df_test["tokens"].apply(lambda tokens: construct_vector(tokens)) 

In [29]:
df_test

Unnamed: 0,tokens,label,vectors
0,"[sea, doo, pro, sea, scooter, (, sports, with,...",,"[-0.3143011, 0.5532294, 0.047142576, -0.275131..."
1,"[shucks, well, i, work, all, week, so, now, i,...",,"[-0.3314672, 0.0637904, -0.36956653, 0.8440420..."
2,"[i, cant, stay, away, from, bug, thats, my, baby]",,"[0.09588291, -0.057890374, -1.1286697, 0.63654..."
3,"[no, ma'am, !, !, !, lol, im, perfectly, fine,...",,"[0.3203415, -0.56886244, -0.44774532, 0.755875..."
4,"[whenever, i, fall, asleep, watching, the, tv,...",,"[-0.8357741, 0.1895941, -0.48067373, 0.6500084..."
...,...,...,...
9995,"[had, a, nice, time, w, /, my, friend, lastnite]",,"[-0.49510616, 1.4082574, -0.7341637, 0.8630101..."
9996,"[no, it, 's, not, !, please, stop, !]",,"[-0.45468384, -0.09401141, -1.0524045, 0.60986..."
9997,"[not, without, my, daughter, (, dvd, two-time,...",,"[-0.40592158, 0.1365474, -0.24947916, -0.08719..."
9998,"[have, fun, in, class, sweetcheeks]",,"[-0.54124427, 0.07986082, 1.1179461, 0.9593057..."


In [26]:
x_test = df_test["vectors"].tolist()
x_test

[array([-0.3143011 ,  0.5532294 ,  0.04714258, -0.2751314 ,  0.31818092,
        -0.04277517, -0.41522083,  0.7561016 ,  0.2611081 , -0.6523372 ,
        -0.39215255, -0.50573844, -0.28559628, -0.44800955,  0.152234  ,
        -1.05679   , -0.25219148, -0.45640177,  0.18784277, -0.50002617,
        -1.0466837 ,  0.00945948, -0.7581415 , -0.27405757, -0.24371396,
         0.30430093, -0.85820657,  0.00711154,  0.33094373, -0.14885084,
        -0.34353575,  0.14662755, -0.6947613 , -0.08085896, -0.17408954,
         0.28387076, -0.2155703 , -0.3428424 , -0.20694108, -0.03348693,
         0.2702226 , -0.3330377 , -0.09965073,  0.98090214,  0.3322804 ,
         0.10045293, -0.24139239,  0.2238913 ,  0.21290791,  0.5197571 ,
        -0.4111025 ,  0.03553316, -0.2949139 , -0.37157053, -0.6041411 ,
        -0.26161453,  0.11435214, -0.09598563, -0.18780242, -0.03755728,
        -0.15375122, -0.3532946 ,  0.05339208,  0.20240316,  0.19203289,
         0.20001972, -0.687775  ,  0.3416968 , -0.1

In [27]:
predictions = train_test_model("NeuralNetwork",x_train,y_train,x_test)

In [28]:
ids=[i for i in range(1,len(predictions)+1)]
with open("submission.csv", 'w', newline='') as csvfile:
    fieldnames = ['Id', 'Prediction']
    writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
    writer.writeheader()
    for r1, r2 in zip(ids, predictions):
        writer.writerow({'Id':int(r1),'Prediction':int(r2)})