# tweeter sentiment analysis

## import libraries and load_datasets

In [1]:
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [2]:
df_train = pd.read_csv('train_E6oV3lV.csv')
df_test = pd.read_csv('test_tweets_anuFYb8.csv')
df_train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [3]:
df_test.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [4]:
pd.read_csv('sample_submission_gfvA5FD.csv').head()

Unnamed: 0,id,label
0,31963,0
1,31964,0
2,31965,0
3,31966,0
4,31967,0


In [5]:
df_train.shape

(31962, 3)

In [6]:
#drop null values
df_train = df_train.dropna()
df_train.shape

(31962, 3)

In [7]:
#get the independent feature
x = df_train.drop('label',axis=1)
#get dependent feature
y = df_train.label

In [8]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: label, dtype: int64

In [9]:
x['tweet'][22]

"product of the day: happy man #wine tool  who's   it's the #weekend? time to open up &amp; drink up!"

In [10]:
ps = PorterStemmer()
corpus = []

for i in range(0, len(x)):
    review = re.sub('[^a-zA-Z]', ' ', x['tweet'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)
    
    
corpus1 = []

for i in range(0, len(df_test)):
    review = re.sub('[^a-zA-Z]', ' ', df_test['tweet'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus1.append(review)

In [12]:
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,3))
x = tfidf.fit_transform(corpus).toarray()
test = tfidf.fit_transform(corpus1).toarray()

In [13]:
print(x.shape)
print(test.shape)

(31962, 5000)
(17197, 5000)


In [14]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=0)

In [15]:
tfidf.get_feature_names()[:10]

['aap',
 'aap spokesperson alkalamba',
 'ab',
 'abl',
 'abl see',
 'abrahamhick',
 'abrahamhick lawofattract',
 'abrahamhick lawofattract healthi',
 'absolut',
 'abt']

In [16]:
tfidf.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': 5000,
 'min_df': 1,
 'ngram_range': (1, 3),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': None,
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

In [17]:
count_df = pd.DataFrame(X_train, columns=tfidf.get_feature_names())
count_df.head() 

Unnamed: 0,aap,aap spokesperson alkalamba,ab,abl,abl see,abrahamhick,abrahamhick lawofattract,abrahamhick lawofattract healthi,absolut,abt,...,yoy,yr,yr old,yum,yummi,yyc,zero,zombi,zone,zoo
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
count_test = pd.DataFrame(test, columns=tfidf.get_feature_names())
count_test.head()

Unnamed: 0,aap,aap spokesperson alkalamba,ab,abl,abl see,abrahamhick,abrahamhick lawofattract,abrahamhick lawofattract healthi,absolut,abt,...,yoy,yr,yr old,yum,yummi,yyc,zero,zombi,zone,zoo
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### MultinomialNB Algorithm

In [20]:
minb_model = MultinomialNB()
minb_model.fit(X_train, y_train)
minb_model.score(X_test, y_test)

0.95060675009480466

In [23]:
minb_pred = minb_model.predict(X_test)
confusion_matrix(y_test, minb_pred)

NameError: name 'confusion_matrix' is not defined

### submission file

In [25]:
prediction = minb_model.predict(count_test)
prediction = prediction.astype(int)

In [28]:
la = pd.DataFrame(prediction)
r = pd.concat([df_test.id, la], axis=1)
r.columns=['id', 'label']
r.head()

Unnamed: 0,id,label
0,31963,0
1,31964,0
2,31965,0
3,31966,0
4,31967,0


In [29]:
r.label.value_counts()

0    16496
1      701
Name: label, dtype: int64

In [30]:
r.to_csv('sub1.csv')