 # DeepTweets Classification

#### Team name : PGX- DS-T3256
#### User-name : ORCL-DS-APP3791

In [None]:
# libraries used for dataset preparation, feature engineering, model training 
import pandas as pd
import numpy as np
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
import pandas, numpy, textblob, string
from nltk.corpus import stopwords
import re
from textblob import Word
from sklearn.ensemble import VotingClassifier

# 1. Dataset Preparation

In [None]:
# Read the competition's train and test CSV files from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In this step I did some basic pre-processing: lowercasing, removing punctuation, stopwords and digits, and lemmatization. I also observed that the dataset contains a lot of hyperlinks, so I removed them as the last step, which gave a pretty good improvement.

In [None]:
stop = stopwords.words('english')
# Set lookup keeps the per-token stopword test O(1); `stop` itself stays a list.
_stop_set = set(stop)
# Patterns are compiled once and applied with `re` directly, so the result does
# not depend on the pandas version's default for `Series.str.replace(regex=...)`.
_punctuation_re = re.compile(r'[^\w\s]')
_link_re = re.compile(r'http\S+')


def _clean_tweet(text):
    """Return one tweet's text normalised for vectorisation.

    The steps run in this order:
      1. lowercase every token and collapse runs of whitespace,
      2. delete every character that is neither a word character nor whitespace,
      3. drop English stopwords,
      4. lemmatize each remaining token,
      5. drop tokens made only of digits,
      6. delete anything that still starts with "http" up to the next whitespace.
    """
    # Lower case
    text = " ".join(token.lower() for token in text.split())
    # Removing punctuation
    text = _punctuation_re.sub('', text)
    # removing stopwords
    text = " ".join(token for token in text.split() if token not in _stop_set)
    # Lemmatization
    text = " ".join(Word(token).lemmatize() for token in text.split())
    # removing digits
    text = " ".join(token for token in text.split() if not token.isdigit())
    # removing links (punctuation is already gone, so a URL is one "http..." token)
    return _link_re.sub('', text)


# Assign whole columns at once instead of writing cell by cell through
# chained indexing, which pandas does not guarantee to write back to the frame.
train['TweetText'] = train['TweetText'].apply(_clean_tweet)
test['TweetText'] = test['TweetText'].apply(_clean_tweet)

In [None]:
# split the data into training and validation set
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(train['TweetText'], train['Label'])
# Label-encode the target variable. The encoder is fitted once, on every label
# in the training file, and then only *applied* to each split, so both splits
# share a single class-to-integer mapping even if a class is missing from one.
encoder = preprocessing.LabelEncoder()
encoder.fit(train['Label'])
train_y = encoder.transform(train_y)
valid_y = encoder.transform(valid_y)

# 2. Feature Engineering

In this step I transformed the raw text data into feature vectors: count vectors and TF-IDF vectors, at the word level and at the n-gram level.

In [None]:
# create a count vectorizer object
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
# Learn the vocabulary from the training text only. fit() replaces any
# previously learned vocabulary instead of extending it, so fitting again on
# the test text would leave the features built from test-set words alone.
count_vect.fit(train['TweetText'])

# transform the training, validation and test data using count vectorizer object
xtrain_count = count_vect.transform(train_x)
xvalid_count = count_vect.transform(valid_x)
xtest_count = count_vect.transform(test['TweetText'])

In [None]:
# Word-level TF-IDF: fit on the full training text (vocabulary limited via
# max_features=5000), then vectorise the train split, validation split and test set.
tfidf_vect = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    max_features=5000,
).fit(train['TweetText'])
xtrain_tfidf, xvalid_tfidf, xtest_tfidf = (
    tfidf_vect.transform(texts) for texts in (train_x, valid_x, test['TweetText'])
)

In [None]:
# N-gram TF-IDF (unigrams and bigrams, ngram_range=(1,2)): fit on the full
# training text, then vectorise the train split, validation split and test set.
tfidf_vect_ngram = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1,2),
    max_features=5000,
).fit(train['TweetText'])
xtrain_tfidf_ngram, xvalid_tfidf_ngram, xtest_tfidf_ngram = (
    tfidf_vect_ngram.transform(texts) for texts in (train_x, valid_x, test['TweetText'])
)

# 3. Model Training

In this step I trained several models on the three feature sets created in the previous step and displayed the accuracy of each one for comparison.

In [None]:
# simple helper to train a model and report its validation accuracy
def model_training(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False, label_valid=None):
    """Fit `classifier` and return its accuracy on the validation features.

    Parameters
    ----------
    classifier : object exposing ``fit(X, y)`` and ``predict(X)``.
    feature_vector_train : training feature matrix passed to ``fit``.
    label : training labels passed to ``fit``.
    feature_vector_valid : validation feature matrix passed to ``predict``.
    is_neural_net : when true, the prediction array is reduced with
        ``argmax(axis=-1)`` before scoring.
    label_valid : true labels for ``feature_vector_valid``. When omitted, the
        module-level ``valid_y`` is used, which keeps existing calls working.

    Returns
    -------
    The value of ``metrics.accuracy_score`` for the validation predictions.
    """
    if label_valid is None:
        # Backward-compatible default: score against the notebook's global split.
        label_valid = valid_y

    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    # accuracy_score takes (y_true, y_pred)
    return metrics.accuracy_score(label_valid, predictions)

In [None]:
# Score Multinomial Naive Bayes on each feature representation, in order:
# count vectors, word-level TF-IDF, n-gram TF-IDF.
nb_feature_sets = (
    ("NB, Count Vectors: ", xtrain_count, xvalid_count),
    ("NB, WordLevel TF-IDF: ", xtrain_tfidf, xvalid_tfidf),
    ("NB, N-Gram Vectors: ", xtrain_tfidf_ngram, xvalid_tfidf_ngram),
)
nb_scores = {}
for nb_title, nb_train_features, nb_valid_features in nb_feature_sets:
    nb_scores[nb_title] = model_training(naive_bayes.MultinomialNB(), nb_train_features, train_y, nb_valid_features)
    print (nb_title, nb_scores[nb_title])

# Keep the two result names this cell has always defined: the count-vector
# score and the score of the last run.
accuracyc = nb_scores["NB, Count Vectors: "]
accuracy = nb_scores["NB, N-Gram Vectors: "]

In [None]:
# Score Logistic Regression on each feature representation, in order:
# count vectors, word-level TF-IDF, n-gram TF-IDF.
lr_feature_sets = (
    ("LR, Count Vectors: ", xtrain_count, xvalid_count),
    ("LR, WordLevel TF-IDF: ", xtrain_tfidf, xvalid_tfidf),
    ("LR, N-Gram Vectors: ", xtrain_tfidf_ngram, xvalid_tfidf_ngram),
)
for lr_title, lr_train_features, lr_valid_features in lr_feature_sets:
    # A fresh, default-configured model is trained for every representation.
    accuracy = model_training(linear_model.LogisticRegression(), lr_train_features, train_y, lr_valid_features)
    print (lr_title, accuracy)

In [None]:
# Score a Random Forest on each feature representation, in order:
# count vectors, word-level TF-IDF, n-gram TF-IDF.
rf_feature_sets = (
    ("RF, Count Vectors: ", xtrain_count, xvalid_count),
    ("RF, WordLevel TF-IDF: ", xtrain_tfidf, xvalid_tfidf),
    ("RF, WordLevel n-grams TF-IDF: ", xtrain_tfidf_ngram, xvalid_tfidf_ngram),
)
for rf_title, rf_train_features, rf_valid_features in rf_feature_sets:
    # A fresh, default-configured forest is trained for every representation.
    accuracy = model_training(ensemble.RandomForestClassifier(), rf_train_features, train_y, rf_valid_features)
    print (rf_title, accuracy)

For the final submission I used the Naive Bayes model on word-level TF-IDF vectors, as it performed best on the validation set.

In [None]:
# Fit Naive Bayes on the word-level TF-IDF training features and predict the test tweets.
model_nv = naive_bayes.MultinomialNB()
model_nv.fit(xtrain_tfidf, train_y)
predictions = model_nv.predict(xtest_tfidf)

# Encoded class 0 is written out as 'Politics'; any other value as 'Sports'.
pred = ['Sports' if encoded_label != 0 else 'Politics' for encoded_label in predictions]

# One row per test tweet, saved without the DataFrame index column.
submission = pd.DataFrame({'TweetId': test['TweetId'], 'Label': pred})
submission.to_csv('final_submission.csv', index=False)

I also tried combining different models using a basic ensemble technique called max voting, but it didn't improve the final results. The code for this model is below (I used the VotingClassifier from sklearn).

In [None]:
# Base learners, all with default settings.
model_nv = naive_bayes.MultinomialNB()
model_rf = ensemble.RandomForestClassifier()
model_ln = linear_model.LogisticRegression()
model_svm = linear_model.SGDClassifier()

# Hard voting over the four learners, trained on the word-level TF-IDF features.
voting_members = [
    ('nv', model_nv),
    ('ln', model_ln),
    ('rf', model_rf),
    ('svm', model_svm),
]
model_vote = VotingClassifier(estimators=voting_members, voting='hard')
model_vote.fit(xtrain_tfidf, train_y)

predictions = model_vote.predict(xtest_tfidf)
# Encoded class 0 is written out as 'Politics'; any other value as 'Sports'.
pred = ['Sports' if encoded_label != 0 else 'Politics' for encoded_label in predictions]

# One row per test tweet, saved without the DataFrame index column.
voting_results = pd.DataFrame({'TweetId': test['TweetId'], 'Label': pred})
voting_results.to_csv('results_stacked.csv', index=False)

# 4. Improvements

Collect Data : We can collect more data from Twitter using the Twitter API and label it for training, because the training data for the competition is very limited, although it is balanced. When collecting new data we also need to keep the classes balanced in order to get a robust model.