### PANDAS TO READ THE CSV FILES, NUMPY FOR GENERAL NUMERICAL USE

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the training data; 'train.csv' is resolved relative to the working directory.
train_data = pd.read_csv('train.csv')

In [None]:
# Peek at the first three rows to check the available columns.
train_data.head(3)

Let's find out the size of our training set.

In [None]:
# Number of rows (tweets) in the training frame.
size = len(train_data)
print(size)

In [None]:
import seaborn as sns
# Class distribution of the training labels.
# The column is passed by keyword: recent seaborn releases accept only `data`
# positionally, so a positional Series is no longer interpreted as `x`.
sns.countplot(x='sentiment', data=train_data)

### LIBRARIES FOR PREPROCESSING TEXT

In [None]:
import nltk
import re
# Fetch the NLTK stop-word corpus so that stopwords.words('english') can be used below.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

### PREPARING A CORPUS OUT OF THE TEXT
The function `prepare_corpus` builds a corpus of cleaned sentences whose words have been stemmed with NLTK's `PorterStemmer`.
<br> Wrapping these steps in a function makes them easy to reuse later (for the test data).
<br> The resulting corpus is a list with one cleaned, stemmed string per tweet.

In [None]:
def prepare_corpus(tweets):
  """Build a cleaned, stemmed corpus from the ``message`` column of ``tweets``.

  Each message goes through these steps, in order:
    1. every character outside ``a-zA-Z`` is replaced by a space;
    2. the literal substring ``user`` is removed;
    3. the text is lower-cased and split on whitespace;
    4. English stop words are dropped and the remaining words are
       Porter-stemmed;
    5. the stems are joined back together with single spaces.

  Args:
    tweets: DataFrame with a ``message`` column. The values are passed to
      ``re.sub`` and therefore must be strings.

  Returns:
    A list of cleaned strings, one per row of ``tweets``, in row order.
  """
  ps = PorterStemmer()
  # Load the stop-word list once and keep it as a set: the original rebuilt
  # the full list for every single word, which dominated the run time.
  english_stopwords = set(stopwords.words('english'))

  corpus_tweets = []
  # Iterate over the column's values rather than looking rows up by the
  # labels 0..n-1, so frames with a non-default index (filtered, shuffled,
  # sliced) are handled correctly as well.
  for message in tweets['message']:
    tweet = re.sub(pattern='[^a-zA-Z]', repl=' ', string=message)

    # NOTE(review): this runs before lower-casing, so 'User'/'USER' survive,
    # and it also strips 'user' from inside longer words (e.g. 'users' -> 's').
    # Order kept as-is so the extracted features do not change.
    tweet = re.sub(pattern='user', repl='', string=tweet)

    words = tweet.lower().split()

    stems = [ps.stem(word) for word in words if word not in english_stopwords]

    corpus_tweets.append(' '.join(stems))
  return corpus_tweets

# Clean and stem every training tweet (the 'message' column of train_data).
corpus_tweets_train = prepare_corpus(train_data)

In [None]:
# Show the first two cleaned tweets as a sanity check.
corpus_tweets_train[0:2]

### TFIDF Vectorizer
The corpus has to be converted into a meaningful numeric representation before a model can be trained on it.
<br>A TF-IDF vectorizer generally performs better for this kind of text preprocessing than a plain count vectorizer.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Limit the vocabulary to at most 7000 terms.
tfidf = TfidfVectorizer(max_features=7000)
# Fit on the training corpus only; .toarray() turns the sparse result into a dense array.
X_tfidf = tfidf.fit_transform(corpus_tweets_train).toarray()
# Target labels taken from the same frame the corpus was built from.
y_ifidf = train_data['sentiment'].values

In [None]:
# Inspect the first two TF-IDF feature vectors.
X_tfidf[0:2]

### SPLITTING THE DATA FOR TRAIN AND TEST
Using `train_test_split`, X and y are split in a 70:30 ratio (`test_size=0.3`).

In [None]:
from sklearn.model_selection import train_test_split
def split_train_test(X, y, test_size=0.3, random_state=42):
  """Split features and labels into train and test partitions.

  Args:
    X: feature matrix.
    y: labels aligned with the rows of ``X``.
    test_size: fraction of the samples held out for testing. Defaults to
      0.3, the value that was previously hard-coded.
    random_state: seed forwarded to ``train_test_split`` so the split (and
      every metric computed from it) is reproducible across runs. Pass
      ``None`` to get a different random split on each call.

  Returns:
    Tuple ``(X_train, X_test, y_train, y_test)``.
  """
  X_train , X_test , y_train , y_test = train_test_split(
      X, y, test_size=test_size, random_state=random_state)
  return X_train , X_test , y_train , y_test

# Hold out part of the TF-IDF features and labels for evaluation.
X_train_idf , X_test_idf , y_train_idf , y_test_idf = split_train_test(X_tfidf, y_ifidf)

### ACCURACY AND CLASSIFICATION REPORT FUNCTION

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
def accuracy_check(model,data,label):
  """Print a classification report for ``model`` and return its accuracy.

  Args:
    model: fitted estimator exposing ``predict``.
    data: feature matrix to predict on.
    label: true labels for ``data``.

  Returns:
    The accuracy score of the predictions against ``label``.
  """
  predictions = model.predict(data)
  report = classification_report(label , predictions)
  print(report)
  return accuracy_score(label , predictions)

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Baseline: Multinomial Naive Bayes with default settings, fitted on the training split.
nb_idf = MultinomialNB().fit(X_train_idf , y_train_idf)
# accuracy_check prints the per-class report and hands back the overall accuracy.
nb_idf_accuracy = accuracy_check(nb_idf , X_test_idf , y_test_idf)
print(nb_idf_accuracy)

### MODEL PERFORMANCE
The accuracy is nearly 67% on the test data.
<br> For the final model, the complete training data can be used.

#### A FUNCTION TO FIND AN APPROPRIATE VALUE OF ALPHA (HYPERPARAMETER)

In [None]:
def optimization_idf(X_train_idf , X_test_idf , y_train_idf , y_test_idf):
  """Grid-search the MultinomialNB smoothing parameter ``alpha``.

  For every alpha produced by ``np.arange(0.1, 1.1, 0.1)`` a fresh
  classifier is fitted on the training split and scored by accuracy on the
  test split. Each score is printed, followed by a summary line for the
  winner. On ties the first (smallest) alpha is kept.

  NOTE(review): alpha is chosen on the same split that is passed in as the
  test data; a separate validation split would avoid an optimistic estimate.

  Returns:
    The alpha value with the highest accuracy (0.0 if no score exceeds 0).
  """
  top_score = 0.0
  top_alpha = 0.0
  for alpha in np.arange(0.1,1.1,0.1):
    candidate = MultinomialNB(alpha=alpha).fit(X_train_idf, y_train_idf)
    score = accuracy_score(y_test_idf, candidate.predict(X_test_idf))
    print("Accuracy score for alpha={} is: {}%".format(round(alpha,1), round(score*100,2)))
    # Only a strictly better score replaces the current best.
    if not score>top_score:
      continue
    top_score, top_alpha = score, alpha
  print('The best accuracy is {}% with alpha value as {}'.format(round(top_score*100, 2), round(top_alpha,1)))
  return top_alpha

# Run the alpha search on the train/test split created earlier.
optimal_value_idf = optimization_idf(X_train_idf , X_test_idf , y_train_idf , y_test_idf)

In [None]:
# Final model: trained on the complete training data with the alpha found by
# the search, instead of a hard-coded 0.2 that can disagree with the search result.
ml_model_final = MultinomialNB(alpha = optimal_value_idf)
ml_model_final.fit(X_tfidf , y_ifidf)

### MAKING PREDICTIONS FOR THE TEST DATA

In [None]:
# Load the tweets to predict on; 'test.csv' is resolved relative to the working directory.
test_data = pd.read_csv('test.csv')

In [None]:
# Peek at the first three rows of the test data.
test_data.head(3)

<br>The same function is used to convert the test tweets into a corpus.
<br>The TF-IDF vectorizer fitted previously is used for the transformation.

In [None]:
# Apply the same cleaning/stemming that was used for the training tweets.
corpus_test = prepare_corpus(test_data)
# transform (not fit_transform): reuse the vectorizer already fitted on the training corpus.
vectors = tfidf.transform(corpus_test).toarray()

In [None]:
# Predict a sentiment label for every test tweet.
answer = ml_model_final.predict(vectors)

In [None]:
# Work on a copy: a plain assignment would only alias test_data, so adding
# columns to `submission` would silently modify the loaded test frame too.
submission = test_data.copy()
submission.head(3)

In [None]:
# Attach the predicted labels as a 'sentiment' column.
submission['sentiment'] = answer

### FINAL CHECKS

In [None]:
# Confirm that the 'sentiment' column is present.
submission.head()

In [None]:
# Collect the predictions equal to 1 and show how many there are.
ones = list(filter(lambda ans: ans == 1, answer))
len(ones)

In [None]:
import seaborn as sns
# Class distribution of the predicted labels.
# The column is passed by keyword: recent seaborn releases accept only `data`
# positionally, so a positional Series is no longer interpreted as `x`.
sns.countplot(x='sentiment', data=submission)

In [None]:
# Keep only the 'tweetid' and 'sentiment' columns for the output file.
prediction = submission.filter(['tweetid','sentiment'], axis=1)

In [None]:
# Write the predictions to CSV without the DataFrame index.
prediction.to_csv('submissionnb.csv' , index=False)