In [None]:
import pandas as pd
import re
import math
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve,auc
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.callbacks import ModelCheckpoint
from keras.wrappers.scikit_learn import KerasClassifier



#importing dataset
dataset=pd.read_csv('train_E6oV3lV.csv')

#Data preprocessing phase
# Build the stop-word set and the stemmer once. They do not change between
# tweets, and rebuilding the set for every single word is extremely slow.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()
corpus = []
# Iterate over the column itself instead of a hard-coded row count so the cell
# keeps working when the CSV has a different number of rows.
for tweet in dataset['tweet']:
    # Keep letters only, lower-case, tokenise on whitespace.
    review = re.sub('[^a-zA-Z]', ' ', tweet)
    review = review.lower().split()
    # Drop stop words and reduce the remaining tokens to their stems.
    review = [ps.stem(word) for word in review if word not in stop_words]
    corpus.append(' '.join(review))

# Bag-of-words features limited to the 2000 most frequent terms.
cv = CountVectorizer(max_features = 2000)
x = cv.fit_transform(corpus).toarray()
# The second CSV column is used as the target.
y = dataset.iloc[:,1].values
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)


In [None]:
#training neural networks

# Three 750-unit ReLU hidden layers, each followed by 20% dropout, and a
# single sigmoid output unit for the binary target.
nn_classifier = Sequential([
    Dense(input_dim=2000, units=750, activation='relu', kernel_initializer='uniform'),
    Dropout(rate=0.2),
    Dense(units=750, activation='relu', kernel_initializer='uniform'),
    Dropout(rate=0.2),
    Dense(units=750, activation='relu', kernel_initializer='uniform'),
    Dropout(rate=0.2),
    Dense(units=1, activation='sigmoid', kernel_initializer='uniform'),
])
nn_classifier.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Write a checkpoint whenever the training loss improves; the file name
# records the epoch number and the loss at that epoch.
path = 'tsa_weights.{epoch:02d}-{loss:.2f}.hdf5'
mcp = ModelCheckpoint(path, monitor='loss', save_best_only=True, verbose=0)
nn_classifier.fit(x_train, y_train, epochs=100, batch_size=30, callbacks=[mcp])


In [None]:
# Raw sigmoid outputs of the network for the held-out split.
sent_pred=nn_classifier.predict(x_test)
# Hard labels at a 0.5 cut-off.
test_set=(sent_pred>0.5)
# scikit-learn lays the binary confusion matrix out as [[TN, FP], [FN, TP]]
# (rows = true label, columns = predicted label).
nn_cm=confusion_matrix(y_test,test_set)
tn,fp,fn,tp=nn_cm.ravel()
nn_accuracy=(tp+tn)/(tp+tn+fp+fn)*100
# Precision and recall are reported for the positive class (assumes the
# positive label is 1, which matches the 'f1' scoring used in the grid search).
# The original code used nn_cm[0,0] (true negatives) as the numerator.
# Guard the denominators: a model that never predicts the positive class
# would otherwise yield NaN and make math.floor() raise below.
nn_precision=tp/(tp+fp) if (tp+fp) else 0.0
nn_recall=tp/(tp+fn) if (tp+fn) else 0.0
if nn_precision+nn_recall:
    nn_f1_score=(2*nn_precision*nn_recall)/(nn_precision+nn_recall)
else:
    nn_f1_score=0.0
# roc_curve expects (y_true, y_score): true labels first, then the continuous
# scores. Passing thresholded predictions collapses the curve to one point.
nn_fpr,nn_tpr,nn_threshold=roc_curve(y_test,sent_pred.ravel())
nn_roc_auc = auc(nn_fpr,nn_tpr)

print("Accuracy of NN is {}%".format(math.floor(nn_accuracy)))
print("Precision of NN is {}%".format(math.floor(nn_precision*100)))
print("Recall of NN is {}%".format(math.floor(nn_recall*100)))
print("F1_score of NN is {}%".format(math.floor(nn_f1_score*100)))
print("ROC_curve of NN is {}%".format(math.floor(nn_roc_auc*100)))

#nn_roc_curve graph
plt.title('TSA Receiver Operating Characteristic')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.plot(nn_fpr,nn_tpr)
# Diagonal reference line: the ROC of a random classifier.
plt.plot([0, 1], [0, 1],'r--')
plt.savefig('TSA.png')



In [87]:

#Deploying model
# Rebuild the architecture layer by layer so the saved weights can be loaded
# into it.
nn_classifier=Sequential()
nn_classifier.add(Dense(input_dim=2000,units=750,activation='relu',kernel_initializer='uniform'))
nn_classifier.add(Dropout(rate=0.2))
nn_classifier.add(Dense(units=750,activation='relu',kernel_initializer='uniform'))
nn_classifier.add(Dropout(rate=0.2))
nn_classifier.add(Dense(units=750,activation='relu',kernel_initializer='uniform'))
nn_classifier.add(Dropout(rate=0.2))
nn_classifier.add(Dense(units=1,activation='sigmoid',kernel_initializer='uniform'))
# NOTE(review): this checkpoint name (epoch 06, loss 0.01) presumably comes
# from one particular training run - confirm the file exists before running.
nn_classifier.load_weights("tsa_weights.06-0.01.hdf5")

#data preprocessing of new predictions
dataset_x=pd.read_csv('test_tweets_anuFYb8.csv')
#Data preprocessing phase
ps = PorterStemmer()
# Build the stop-word set once instead of once per word.
stop_words = set(stopwords.words('english'))
corpus_x = []
# Iterate over the column itself instead of a hard-coded row count so the cell
# keeps working when the CSV has a different number of rows.
for tweet in dataset_x['tweet']:
    # Keep letters only, lower-case, tokenise on whitespace.
    clean_tweets = re.sub('[^a-zA-Z]', ' ', tweet)
    clean_tweets = clean_tweets.lower().split()
    # Drop stop words and reduce the remaining tokens to their stems.
    clean_tweets = [ps.stem(word) for word in clean_tweets if word not in stop_words]
    corpus_x.append(' '.join(clean_tweets))


# Reuse the vectorizer fitted on the training corpus (transform, not
# fit_transform) so the columns line up with what the network was trained on.
X_new_test = cv.transform(corpus_x).toarray()
predicted_new=nn_classifier.predict(X_new_test)

In [76]:
# Labels written for scores above / not above the cut-off.
a=1
b=0
rr=[]

# Turn each predicted score into a hard label using a 0.7 cut-off.
for predicted_new_v in predicted_new:
    rr.append(a if predicted_new_v > 0.7 else b)
            

In [78]:
from pandas import DataFrame
# Pair every test-tweet id with its label from `rr` and write the table to an
# Excel sheet.
dataa = dict(ID=dataset_x['id'].values, TWEETS=rr)
df = DataFrame(dataa)
df.to_excel(
    'test_sa.xlsx',
    sheet_name='sheet1',
    index=False,
)

In [None]:
#cross validation data
def build_classifier():
    """Build and compile the network used for cross-validation.

    The architecture is three 750-unit ReLU hidden layers, each followed by
    20% dropout, and a single sigmoid output unit.

    Returns:
        A compiled Keras ``Sequential`` model (adam optimizer, binary
        cross-entropy loss, accuracy metric).
    """
    nn_classifier=Sequential()
    # The input width must equal the number of feature columns. The vectorizer
    # in this notebook is capped at 2000 features and every other model here
    # uses 2000; the previous value of 1500 causes a shape mismatch on fit.
    nn_classifier.add(Dense(input_dim=2000,units=750,activation='relu',kernel_initializer='uniform'))
    nn_classifier.add(Dropout(rate=0.2))
    nn_classifier.add(Dense(units=750,activation='relu',kernel_initializer='uniform'))
    nn_classifier.add(Dropout(rate=0.2))
    nn_classifier.add(Dense(units=750,activation='relu',kernel_initializer='uniform'))
    nn_classifier.add(Dropout(rate=0.2))
    nn_classifier.add(Dense(units=1,activation='sigmoid',kernel_initializer='uniform'))
    nn_classifier.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
    return nn_classifier


# 10-fold cross-validation of the Keras model on the training split.
classifier = KerasClassifier(build_fn=build_classifier, epochs=100, batch_size=10)
accuracies = cross_val_score(
    estimator=classifier,
    X=x_train,
    y=y_train,
    cv=10,
    n_jobs=-1,
)
# NOTE: despite its name, `variance` holds the standard deviation of the
# per-fold scores.
mean, variance = accuracies.mean(), accuracies.std()



In [None]:
#grid search cv
def build_classifier(optimizer):
    """Build and compile the network for the optimizer grid search.

    Args:
        optimizer: optimizer passed straight to ``compile``.

    Returns:
        A compiled Keras ``Sequential`` model with binary cross-entropy loss.
    """
    # First hidden block carries the input width.
    layers = [
        Dense(input_dim=2000, units=750, activation='relu', kernel_initializer='uniform'),
        Dropout(rate=0.2),
    ]
    # Two further identical hidden blocks.
    for _ in range(2):
        layers.append(Dense(units=750, activation='relu', kernel_initializer='uniform'))
        layers.append(Dropout(rate=0.2))
    # Single sigmoid output unit.
    layers.append(Dense(units=1, activation='sigmoid', kernel_initializer='uniform'))

    model = Sequential(layers)
    model.compile(optimizer=optimizer, loss='binary_crossentropy')
    return model

# Compare optimizers with 10-fold cross-validation, scored by F1.
classifier = KerasClassifier(build_fn = build_classifier)
parameters={'optimizer':['adam','rmsprop','sgd']}
gscv=GridSearchCV(estimator=classifier,param_grid=parameters,cv=10,scoring='f1')
# epochs and batch_size are passed to fit() as fit parameters.
gscv_model=gscv.fit(x_train,y_train,epochs=10,batch_size=30)

# A notebook cell only renders its last expression, so a bare
# `gscv.best_params_` line above `gscv.best_score_` was silently discarded.
# Show both as a tuple.
gscv.best_params_, gscv.best_score_