In [1]:
import re
import pandas as pd
import numpy as np
import joblib
import pickle
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix

# Fetch the NLTK stop-word corpus; pre_processing_tweets reads it through
# stopwords.words('english').
nltk.download('stopwords')

# One shared Porter stemmer instance, used by pre_processing_tweets for every word.
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
def loading_tweets(file_name):
  """Read a header-less tweet CSV and keep only usable sentiment rows.

  The first four columns are named 'id', 'company', 'sentiment' and
  'raw_tweet'. Rows whose sentiment is 'Irrelevant' or 'Neutral' are
  discarded, as is any row that contains a missing value.

  Args:
    file_name: path (or file-like object) handed to ``pd.read_csv``.

  Returns:
    A ``(raw_tweets, frame)`` tuple: the 'raw_tweet' column as a NumPy
    array, and the filtered DataFrame it was taken from.
  """
  column_names = {0: 'id', 1: 'company', 2: 'sentiment', 3: 'raw_tweet'}
  frame = pd.read_csv(file_name, header=None).rename(columns=column_names)

  # Single mask instead of two consecutive inequality filters.
  unwanted = frame['sentiment'].isin(['Irrelevant', 'Neutral'])
  frame = frame[~unwanted].dropna()

  return frame['raw_tweet'].values, frame

def pre_processing_tweets(tweets):
  """Normalise raw tweets into space-joined, stemmed tokens.

  For each tweet: every character that is not an ASCII letter is replaced
  by a space, the text is lower-cased, English stop words are removed and
  each remaining word is reduced to its Porter stem.

  Args:
    tweets: iterable of raw tweet strings.

  Returns:
    list of str with one cleaned tweet per input tweet, in input order.
  """
  # Build the stop-word lookup once, as a set. stopwords.words() returns a
  # list, so evaluating it per word would cost a list rebuild plus a
  # linear scan for every single token.
  stop_words = set(stopwords.words('english'))
  processed_tweets = []

  for tweet in tqdm(tweets, desc="Processing tweets", unit='tweet'):
    # After the substitution the only whitespace left is ' ', so a bare
    # split() yields exactly the non-empty tokens.
    words = re.sub(r'[^a-zA-Z]', ' ', tweet).lower().split()
    stems = [stemmer.stem(word) for word in words if word not in stop_words]
    # Guard against a stem collapsing to the empty string.
    processed_tweets.append(' '.join(stem for stem in stems if stem))

  return processed_tweets



In [5]:
# Load the labelled training tweets and normalise their text. Each stage
# gets its own name so no variable changes meaning halfway through the cell.
train_tweets_raw, df_train = loading_tweets('twitter_training.csv')
train_tweets_clean = pre_processing_tweets(train_tweets_raw)

# Fit the TF-IDF vocabulary on the training corpus.
tf_idf = TfidfVectorizer()
train_tweets = tf_idf.fit_transform(train_tweets_clean)

# Dense feature matrix.
# NOTE(review): MultinomialNB also accepts the sparse matrix directly; the
# dense copy is kept so x_train stays an ndarray. Drop .toarray() if memory
# becomes a problem.
x_train = train_tweets.toarray()

# Binary target as a flat 1-D vector: 1 if Positive, 0 if Negative.
# Comparing against the label explicitly avoids depending on the column
# order / dtype of pd.get_dummies and on both classes being present.
y_train = (df_train['sentiment'] == 'Positive').to_numpy().astype(int)

Processing tweets: 100%|██████████| 43013/43013 [01:07<00:00, 640.36tweet/s]


*   `x_train` now holds the TF-IDF representation of the training tweets, i.e. the full feature matrix
*   `y_train` holds the sentiment labels (negative or positive) encoded as 0 or 1 (the positive column of the one-hot encoding)



---



**Preparing the Testing data**

In [16]:
# Load the validation tweets and apply the same text normalisation as for
# the training data. Each stage gets its own name so no variable changes
# meaning halfway through the cell.
tweets_test_raw, df_test = loading_tweets('twitter_validation.csv')
tweets_test_clean = pre_processing_tweets(tweets_test_raw)

# transform only (no fit): reuse the vocabulary and IDF weights already
# learned by tf_idf so the feature columns line up with the training matrix.
tweets_test = tf_idf.transform(tweets_test_clean)

x_test = tweets_test.toarray()

# Binary target as a flat 1-D vector: 1 if Positive, 0 if Negative.
# An explicit comparison stays correct even if the validation split happens
# to contain only one class, where slicing pd.get_dummies(...)[:, 1:] would
# return an empty array.
y_test = (df_test['sentiment'] == 'Positive').to_numpy().astype(int)

Processing tweets: 100%|██████████| 543/543 [00:02<00:00, 193.17tweet/s]


Model Building and training

In [18]:
model = MultinomialNB()

# scikit-learn expects a 1-D label vector of shape (n_samples,). Flattening
# here keeps a column-shaped y_train of shape (n_samples, 1) from triggering
# a DataConversionWarning; it is a no-op when y_train is already 1-D.
model.fit(x_train, np.ravel(y_train))

  y = column_or_1d(y, warn=True)


Model Evaluation

In [22]:
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
train_accuracy = accuracy_score(y_train, model.predict(x_train))
test_accuracy = accuracy_score(y_test, model.predict(x_test))

# Reported as percentages rounded to three decimals.
print(f'Training accuracy: {round(train_accuracy*100, 3)}')
print(f'Testing accuracy: {round(test_accuracy*100, 3)}')

Training accuracy: 89.738
Testing accuracy: 91.897


Saving the model and the TF-IDF encoder

In [23]:
# Persist the fitted TF-IDF vectorizer so that new text can be encoded with
# the same vocabulary at inference time.
joblib.dump(tf_idf, 'tfidf_model.joblib')
print('encoder has been saved tfidf_model.joblib')

# Use a context manager so the file handle is flushed and closed even if
# pickling raises.
with open('model.mdl', 'wb') as model_file:
  pickle.dump(model, model_file)
print('model has been saved to model.mdl')

encoder has been saved tfidf_model.joblib
model has been saved to model.mdl
