In [2]:
# Install the Kaggle CLI with %pip so it lands in the environment of the running kernel.
# Pinned to the version reported by `kaggle --version` when this notebook was last run.
%pip install kaggle==1.7.4.5




In [13]:
# Copy the Kaggle API token (kaggle.json, expected in the current working directory) into ~/.kaggle/
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Make the token file readable and writable by its owner only
!chmod 600 ~/.kaggle/kaggle.json


In [14]:
# Sanity check: confirm the Kaggle CLI is installed and runnable
!kaggle --version


Kaggle API 1.7.4.5


In [15]:
# Download the kazanova/sentiment140 dataset from Kaggle as a .zip archive (sentiment140.zip)
!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
  0% 0.00/80.9M [00:00<?, ?B/s]
100% 80.9M/80.9M [00:00<00:00, 1.50GB/s]


In [16]:
# Extract the compressed dataset.
# -o overwrites an existing CSV without asking, so re-running this cell does not
# stall on unzip's interactive "replace?" prompt.
!unzip -o sentiment140.zip


Archive:  sentiment140.zip
  inflating: training.1600000.processed.noemoticon.csv  


In [1]:
# Importing the dependencies
import numpy as np
import pandas as pd
import re            # regular expressions, used for pattern matching
from nltk.corpus import stopwords  # stop-word lists, used to remove stop words from the tweets
from nltk.stem.porter import PorterStemmer  # reduces words to a simple root form
from sklearn.feature_extraction.text import TfidfVectorizer  # converts text to numerical vectors
from sklearn.model_selection import train_test_split  # splits the data into train and test sets
from sklearn.linear_model import LogisticRegression # model used for learning
from sklearn.metrics import accuracy_score  # to calculate accuracy


ModuleNotFoundError: No module named 'pandas'

In [18]:
import nltk
# Fetch the 'stopwords' corpus that nltk.corpus.stopwords reads from
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
# Print the English stop words: common words that are not needed to understand
# the context of a sentence and that will be removed from the dataset
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [20]:
# Data processing

In [21]:
# Load the raw CSV file into a pandas DataFrame.
# No column names are supplied here, so pandas takes the first line of the file as the header.
twitter_data = pd.read_csv(
    '/content/training.1600000.processed.noemoticon.csv',
    encoding='ISO-8859-1',
)

In [22]:
# Checking the number of rows and columns
twitter_data.shape  # 1,599,999 rows x 6 columns: one short of 16 lakh because the first tweet was consumed as the header row

(1599999, 6)

In [23]:
# Printing the first 5 rows of the dataframe
twitter_data.head()  # the header shows a tweet's values: the file has no header line, so the column names must be supplied explicitly

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [24]:
# Supply the column names explicitly and read the file again,
# so that every line of the CSV is treated as data
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']
twitter_data = pd.read_csv(
    '/content/training.1600000.processed.noemoticon.csv',
    names=column_names,
    encoding='ISO-8859-1',
)

In [25]:
# With explicit column names the first tweet is kept as data: 1,600,000 rows x 6 columns
twitter_data.shape

(1600000, 6)

In [26]:
# Preview the first 5 rows to confirm the columns are now correctly named
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [27]:
# Counting the number of missing values in each column
twitter_data.isnull().sum()  # no values are missing in any column, so no missing-value cleaning is needed

Unnamed: 0,0
target,0
id,0
date,0
flag,0
user,0
text,0


In [28]:
# Checking the distribution of the target column
twitter_data['target'].value_counts()
# target is the sentiment label of each tweet
# 0 -> negative , 2 -> neutral , 4 -> positive (only 0 and 4 occur in this file)
# the data is evenly distributed between 0 and 4 (800,000 tweets each)
# if it were not evenly distributed, we would need to upsample or downsample

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
4,800000


In [29]:
# Convert the label 4 to 1 so that 0 = negative and 1 = positive

twitter_data.replace({'target': {4:1}}, inplace = True)

# inplace = True applies the change to twitter_data itself; without it, replace() returns a modified copy and leaves twitter_data unchanged

In [30]:
twitter_data['target'].value_counts() # the target value 4 has been converted to 1; the counts per class are unchanged

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
1,800000


In [31]:
# 0 -> negative
# 1 -> positive

In [32]:
# Stemming is the process of reducing a word to its root word (e.g. "running" -> "run")

In [34]:
port_stem = PorterStemmer() # create a single PorterStemmer instance and keep it in port_stem for reuse

In [35]:
def stemming(content, stemmer=None, stop_words=None):
  """Clean one tweet: keep letters only, lowercase, drop stop words, stem the rest.

  Args:
    content: Raw text of the tweet.
    stemmer: Object exposing a ``stem(word)`` method. Defaults to the
      notebook-level ``port_stem`` instance.
    stop_words: Collection of lowercase words to discard. Defaults to NLTK's
      English stop-word list, fetched once and cached on this function.

  Returns:
    The remaining stemmed words joined by single spaces ('' if none remain).
  """
  if stemmer is None:
    stemmer = port_stem
  if stop_words is None:
    # Fetch the stop-word list once instead of once per word, and keep it as a
    # set so each membership test is a hash lookup.
    if not hasattr(stemming, '_default_stop_words'):
      stemming._default_stop_words = frozenset(stopwords.words('english'))
    stop_words = stemming._default_stop_words

  # Replace everything that is not a letter (digits, punctuation, symbols) with a space.
  letters_only = re.sub('[^a-zA-Z]', ' ', content)
  # Lowercase, then split into words. The original stored this list under a
  # misspelled name and went on to iterate over the characters of the string.
  words = letters_only.lower().split()
  stemmed_words = [stemmer.stem(word) for word in words if word not in stop_words]
  # Join the words back into a single space-separated string.
  return ' '.join(stemmed_words)

In [None]:
# Create a new column holding the result of running stemming() on every tweet's text
twitter_data['stemmed_content'] = twitter_data['text'].apply(stemming)