In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (6,6)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

from keras.layers import Dense, Input, Activation, Conv1D
from keras.layers import Dropout, MaxPooling1D, Flatten, Concatenate, Reshape
from keras.models import Sequential, Model, load_model
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.utils import np_utils

import re
import random
import os

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Amaan_Hussain\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Using TensorFlow backend.


In [2]:
def load_as_list(fname):
    """Read a labelled tweet CSV and return its text and labels as lists.

    Parameters
    ----------
    fname : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``. The file must contain a
        ``tweet`` column and a ``label`` column; other columns are ignored.

    Returns
    -------
    tuple[list, list]
        ``(tweets, label)`` in file order, one entry per row.

    Raises
    ------
    KeyError
        If the ``tweet`` or ``label`` column is missing.
    """
    df = pd.read_csv(fname)
    # Only the columns that are actually returned are read; the previous
    # version also pulled 'id' into a local that shadowed the builtin and was
    # never used.
    tweets = df['tweet'].tolist()
    label = df['label'].tolist()
    return tweets, label

In [3]:
# Load the raw tweets and their labels as two parallel Python lists.
tweets, label = load_as_list("train.csv")

# English stop-word list from NLTK.
stop_words = stopwords.words('english')

print(len(tweets))

# Matches every character that is neither a word character nor whitespace,
# i.e. punctuation and symbols.
non_word_chars = re.compile(r'[^\w\s]')

# Take the first 25570 tweets (the manually chosen 80% training portion) and
# strip punctuation from each one. The slice is a copy, so `tweets` itself is
# left untouched.
x_train = [non_word_chars.sub('', tweet) for tweet in tweets[:25570]]

print(len(x_train))
print(x_train[:15])



31962
25570
[' user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction   run', 'user user thanks for lyft credit i cant use cause they dont offer wheelchair vans in pdx    disapointed getthanked', '  bihday your majesty', 'model   i love u take with u all the time in urð ðððð\x85ððð  ', ' factsguide society now    motivation', '22 huge fan fare and big talking before they leave chaos and pay disputes when they get there allshowandnogo  ', ' user camping tomorrow user user user user user user user dannyâ', 'the next school year is the year for examsð cant think about that ð school exams   hate imagine actorslife revolutionschool girl', 'we won love the land allin cavs champions cleveland clevelandcavaliers  â ', ' user user welcome here   im   its so gr8  ', ' â ireland consumer price index mom climbed from previous 02 to 05 in may   blog silver gold forex', 'we are so selfish orlando standwithorlando pulseshooting orlandoshooting biggerproblems self

In [4]:
# Build TF-IDF features. English stop words are excluded, and a term must
# appear in at least 30 documents to enter the vocabulary (min_df is a
# document frequency, not a raw occurrence count).
tfidf = TfidfVectorizer(stop_words=stop_words, min_df=30)

# Dense (n_tweets, n_terms) matrix of TF-IDF weights.
x = tfidf.fit_transform(x_train).toarray()

# Labels for exactly the tweets that were vectorised, as a column vector.
# Using len(x_train) instead of repeating the split index keeps features and
# labels aligned if the split ever changes.
y_train = np.array(label[:len(x_train)]).reshape(-1, 1)

# Per-tweet mean of the non-zero TF-IDF weights; tweets with no term left in
# the vocabulary get 0. Computed with array operations instead of a Python
# loop over every cell of the dense matrix.
row_sums = x.sum(axis=1)
nonzero_counts = np.count_nonzero(x > 0, axis=1)
tfidf_values = np.divide(
    row_sums,
    nonzero_counts,
    out=np.zeros_like(row_sums),   # rows with no non-zero weight stay 0
    where=nonzero_counts > 0,      # avoids dividing by zero for those rows
).reshape(-1, 1)
# Same data as a list of one-element lists, kept under its original name.
tfidf_sentence_value = tfidf_values.tolist()

# Class balance of the training labels.
count = int(np.count_nonzero(y_train == 0))
print(f"Number of labels 0 is: {count}")
print(f"Number of labels 1 is: {len(y_train) - count}")

print(f"TFIDF Transform Shape: {x.shape} with length: {len(x)}")
print(f"Sample: \n{x[:5]}\n\n")
print(f"TFIDF Value per Sentence Shape: {tfidf_values.shape} with length: {len(tfidf_values)}")
print(f"Labels Shape: {y_train.shape} with length: {len(y_train)}")
    

Number of labels 0 is: 23771
Number of labels 1 is: 1799
TFIDF Transform Shape: (25570, 1005) with length: 25570
Sample: 
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


TFIDF Value per Sentence Shape: (25570, 1) with length: 25570
Labels Shape: (25570, 1) with length: 25570


In [5]:
# Fully connected binary classifier over the TF-IDF feature matrix `x`.
# The input width is read from the data so the network still builds when the
# vocabulary size changes (different corpus or min_df), instead of assuming
# exactly 1005 terms.
n_features = x.shape[1]

model = Sequential([
    Dense(1024, activation='relu', input_shape=(n_features,)),
    Dense(512, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),   # probability of label 1
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.fit(x, y_train, epochs=100, batch_size=100)

# NOTE(review): this scores the model on the same rows it was trained on, so
# the figure is a training accuracy and says nothing about generalisation.
# With the label counts printed earlier (roughly 93% label 0) a constant
# "0" prediction would already score about 93%; evaluate on held-out data and
# look at precision/recall for label 1 before drawing conclusions.
_, accuracy = model.evaluate(x, y_train)
print('Accuracy: %.2f' % (accuracy*100))

# Persist the layer-by-layer summary; the callback parameter is named `line`
# so it does not shadow the feature matrix `x`.
with open('modelsummary.txt', 'w') as f:
    model.summary(print_fn=lambda line: f.write(line + '\n'))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100


Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Accuracy: 99.34
