In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/twitter-sentiment-dataset/Twitter_Data.csv


**Importing Dependencies**

In [2]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [3]:
# Download the NLTK stop-word corpus
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**Load the data into a DataFrame**

In [4]:
data = pd.read_csv('/kaggle/input/twitter-sentiment-dataset/Twitter_Data.csv')

**Data preprocessing**

In [5]:
# Shape of the data
data.shape

(162980, 2)

In [6]:
data.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [7]:
# Check for missing values

data.isna().values.any()

True

In [8]:
# Number of missing values per column

data.isnull().sum()

clean_text    4
category      7
dtype: int64

In [9]:
# Rows whose clean_text is missing.
# Index directly with the null mask: comparing the mask against the scalar
# data.isnull().values.any() only selects the right rows while that scalar is True.

data[data.clean_text.isnull()]

Unnamed: 0,clean_text,category
148,,0.0
158694,,-1.0
159443,,0.0
160560,,1.0


In [10]:
# Rows whose category label is missing (plain null mask, not a comparison against a scalar flag).
data[data.category.isnull()]

Unnamed: 0,clean_text,category
130448,the foundation stone northeast gas grid inaugu...,
155642,dear terrorists you can run but you cant hide ...,
155698,offense the best defence with mission shakti m...,
155770,have always heard politicians backing out thei...,
158693,modi government plans felicitate the faceless ...,
159442,chidambaram gives praises modinomics,
160559,the reason why modi contested from seats 2014 ...,


In [11]:
# Drop every row that has a missing value in any column, then confirm that
# no column contains missing values any more.

data = data.dropna()
data.isna().any()

clean_text    False
category      False
dtype: bool

In [12]:
data.shape

(162969, 2)

**EDA**

In [13]:
# Aliased as px (not ps) so the plotting module does not share a name with the
# PorterStemmer instance `ps` used by the text preprocessing.
import plotly.express as px

# Count tweets per sentiment class. rename_axis / reset_index(name=...) pins the column
# names to 'category' and 'count' regardless of the installed pandas version.
category_count = (
    data.category.value_counts()
    .rename_axis('category')
    .reset_index(name='count')
)

# Derive each slice label from its category code rather than from the row order of
# value_counts(), so the labels stay correct whichever class is the most frequent.
category_count['sentiment'] = category_count['category'].map(
    {1.0: 'Positive', 0.0: 'Neutral', -1.0: 'Negative'}
)

figure = px.pie(category_count, names='sentiment', values='count', title='Count of Sentiments', hole=0.4)
figure.show()

In [14]:
ps = PorterStemmer()

In [15]:
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading the corpus only once."""
    cached = getattr(_english_stop_words, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def stemming(content, stemmer=None, stop_words=None):
    """Normalise a piece of text into a space-separated string of stemmed tokens.

    Every character that is not an ASCII letter is replaced by a space, the text
    is lower-cased and split on whitespace, stop words are removed, and each
    remaining token is stemmed.

    Parameters
    ----------
    content : str
        Raw text to clean.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer to apply to each token. Defaults to the notebook-level
        PorterStemmer instance ``ps``.
    stop_words : collection of str, optional
        Tokens to discard before stemming. Defaults to NLTK's English
        stop-word list.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    if stemmer is None:
        stemmer = ps
    if stop_words is None:
        # The stop-word set used to be rebuilt from the corpus for every single
        # token; it is now loaded once and reused across all calls.
        stop_words = _english_stop_words()

    tokens = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)

In [16]:
# Add a new column, stemmed_text, holding the cleaned and stemmed version of each tweet

data['stemmed_text'] = data.clean_text.apply(stemming)

In [17]:
X = data.stemmed_text
y = data.category

In [18]:
X.shape

(162969,)

In [19]:
# Split the data into training and test sets (80/20, stratified by category)

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2,stratify=y, random_state=10)

In [20]:
# Convert the text into TF-IDF feature vectors.

vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training text only, then reuse that fitted vectorizer
# to transform the test text.
# NOTE(review): train_X / test_X are rebound from text to the transformed output here,
# so re-running this cell without first re-running the split cell feeds already-transformed
# data back into the vectorizer — presumably unintended; confirm before re-running.
train_X = vectorizer.fit_transform(train_X)
test_X = vectorizer.transform(test_X)

**Create and Evaluate The Model**

In [21]:
model = LogisticRegression(max_iter=1000)

In [22]:
model.fit(train_X, train_y)

In [23]:
train_predictions = model.predict(train_X)
test_predictions = model.predict(test_X)

In [24]:
# Evaluate the model on the training split: overall accuracy plus the confusion matrix
# of the true labels (train_y) against the predicted labels (train_predictions).

accuracy_score_train = accuracy_score(train_y, train_predictions)
confusion_matrix_train = confusion_matrix(train_y, train_predictions)

print("accuracy_score train: ", accuracy_score_train)
print("confusion_matrix train: ", confusion_matrix_train)

accuracy_score train:  0.8791025886864813
confusion_matrix train:  [[22072  3011  3324]
 [  987 41272  1910]
 [ 2206  4324 51269]]


In [25]:
# Evaluate the model on the held-out test split: overall accuracy plus the confusion
# matrix of the true labels (test_y) against the predicted labels (test_predictions).

accuracy_score_test = accuracy_score(test_y, test_predictions)
confusion_matrix_test = confusion_matrix(test_y, test_predictions)

# These are test-set metrics, so label them "test" (the labels previously said "train").
print("accuracy_score test: ", accuracy_score_test)
print("confusion_matrix test: ", confusion_matrix_test)

accuracy_score train:  0.8429465545805976
confusion_matrix train:  [[ 5039   985  1078]
 [  367 10034   641]
 [  758  1290 12402]]
