# Connecting Google Drive

In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


# Connecting Kaggle

In [2]:
# Install kaggle
!pip install -q kaggle

In [3]:
# upload kaggle.json file which we downloaded
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"arbazkhancs","key":"140450ae4ad01edb47d18c912d681ad3"}'}

In [4]:
# Create A Kaggle Folder
! mkdir ~/.kaggle

In [5]:
# Copy the kaggle.json file to the above folder
! cp kaggle.json ~/.kaggle/

In [6]:
# Permission for the json to act
! chmod 600 ~/.kaggle/kaggle.json

In [7]:
# to list all kaggle datasets
! kaggle datasets list

ref                                                     title                                          size  lastUpdated          downloadCount  voteCount  usabilityRating  
------------------------------------------------------  --------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
syedanwarafridi/vehicle-sales-data                      Vehicle Sales Data                             19MB  2024-02-21 20:16:17           4645         76  1.0              
nelgiriyewithana/apple-quality                          Apple Quality                                 170KB  2024-01-11 14:31:07          24087        519  1.0              
kanchana1990/spotifys-long-hits-2014-2024               Spotify's Long Hits (2014-2024) 🎶              38KB  2024-02-23 12:39:02           1722         44  1.0              
xontoloyo/data-penjualan-zara                           ZARA Sales                                     17KB  2024-02-27 05:37:53  

# Downloading the Dataset

In [8]:
! kaggle datasets download -d jp797498e/twitter-entity-sentiment-analysis

Downloading twitter-entity-sentiment-analysis.zip to /content
  0% 0.00/1.99M [00:00<?, ?B/s]
100% 1.99M/1.99M [00:00<00:00, 86.5MB/s]


In [9]:
# unzip the downloaded file
! unzip twitter-entity-sentiment-analysis.zip

Archive:  twitter-entity-sentiment-analysis.zip
  inflating: twitter_training.csv    
  inflating: twitter_validation.csv  


# Text Preprocessing

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

In [11]:
df = pd.read_csv("twitter_training.csv", header=None, names=['ID', 'Game', 'Sentiment', 'Tweet'])

In [12]:
df.head()

Unnamed: 0,ID,Game,Sentiment,Tweet
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         74682 non-null  int64 
 1   Game       74682 non-null  object
 2   Sentiment  74682 non-null  object
 3   Tweet      73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [14]:
df.shape

(74682, 4)

In [15]:
# Select only required columns
df = df.drop(columns=["ID", "Game"])

In [16]:
df.head()

Unnamed: 0,Sentiment,Tweet
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...


# Preprocessing

In [51]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

! pip install emoji
import emoji
import re

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')

class preProcess:
    def __init__(self, Dataframe):
        self.Dataframe = Dataframe.copy()
        # Drop all null values
        self.Dataframe.dropna(inplace=True)

        self.tweet = []
        self.sentiments = None


    def process_Sentiment(self):
        mapping = {}
        count = 0
        for i in self.Dataframe["Sentiment"].unique():
            mapping[i] = count
            count += 1

        print(mapping)
        self.Dataframe["Sentiment"] = self.Dataframe["Sentiment"].apply(lambda x: mapping[x])
        self.sentiments = np.array(self.Dataframe["Sentiment"])
        return self.sentiments


    def process_Tweet(self):
        stop_words = set(stopwords.words('english'))
        stemmer = PorterStemmer()

        processed_tweets = []

        for line in tqdm(self.Dataframe["Tweet"]):
            # Remove emojis
            line = emoji.demojize(line)

            # Remove URLs
            line = re.sub(r'http\S+|www\S+', '', line)

            # Remove punctuations except apostrophes
            line = re.sub(r'[^\w\s\']', '', line)

            # Tokenize the line
            tokens = nltk.word_tokenize(line)

            # Remove stop words, single-character words, and perform stemming
            filtered_tokens = [stemmer.stem(token) for token in tokens if token.lower() not in stop_words and len(token) > 1]

            # Join tokens back into a string
            processed_line = ' '.join(filtered_tokens)

            processed_tweets.append(processed_line)

        self.Dataframe["Tweet"] = processed_tweets


    def creat_Tokenizer(self):
        # creating tokenizer
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(self.Dataframe["Tweet"])
        print(tokenizer.word_index)

        self.tweet = tokenizer.texts_to_sequences(self.Dataframe["Tweet"])
        print(self.tweet[0])


    def do_pad_sequence(self):
        # pad sequences
        self.tweet = pad_sequences(self.tweet, padding='post')  # Pad sequences

        print(self.tweet[0])  # Print the first padded sequence



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [52]:
# create object of preProcess class
obj = preProcess(df)

In [53]:
# process the Sentiment
obj.process_Sentiment()

{'Positive': 0, 'Neutral': 1, 'Negative': 2, 'Irrelevant': 3}


array([0, 0, 0, ..., 0, 0, 0])

In [54]:
# process the tweets
obj.process_Tweet()

100%|██████████| 73996/73996 [00:58<00:00, 1268.31it/s]


In [55]:
# create tokenizer
obj.creat_Tokenizer()

[53, 5, 69, 1174]


In [56]:
# make all same length
obj.do_pad_sequence()

[  53    5   69 1174    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0]


# Training Dataset

In [57]:
X_train = obj.tweet
y_train = obj.sentiments

In [58]:
X_train.shape, y_train.shape

((73996, 131), (73996,))

In [59]:
X_train[0]

array([  53,    5,   69, 1174,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
      dtype=int32)

In [60]:
y_train[20:30]

array([0, 0, 0, 0, 2, 2, 2, 2, 2, 2])

# Save The Training Dataset

In [61]:
from pickle import dump

In [62]:
# Dump X_train into a pickle file
with open("/content/drive/MyDrive/Colab Notebooks/Sentiment Anyalasis/X_train.pkl", "wb") as f:
    dump(X_train, f)

In [63]:
# Dump y_train into a pickle file
with open("/content/drive/MyDrive/Colab Notebooks/Sentiment Anyalasis/y_train.pkl", "wb") as f:
    dump(y_train, f)