**01 : Data Pre-processing and basic information about data**

<u>Importing required modules</u>

In [1]:
#Data pre-processing
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

#Handling warnings
import warnings
warnings.filterwarnings("ignore")

<u>Loading data</u>

In [2]:
# Read the raw CSV files, decoding them with the "latin" (Latin-1) codec
train = pd.read_csv("train.csv",encoding="latin") #training data
test = pd.read_csv("test.csv",encoding="latin") #testing data
# Display (rows, columns) of each frame
train.shape,test.shape

((27481, 10), (4815, 9))

In [3]:
# Preview the first three rows of the raw training data
train.head(3)

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18


In [4]:
# Preview the first three rows of the raw test data
test.head(3)

Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45,Algeria,43851044.0,2381740.0,18.0


<u>Basic information about data</u>

In [5]:
# DataFrame.info() prints its summary and returns None, which is why the
# cell output ends with the tuple (None, None)
train.info(),test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   textID            27481 non-null  object 
 1   text              27480 non-null  object 
 2   selected_text     27480 non-null  object 
 3   sentiment         27481 non-null  object 
 4   Time of Tweet     27481 non-null  object 
 5   Age of User       27481 non-null  object 
 6   Country           27481 non-null  object 
 7   Population -2020  27481 non-null  int64  
 8   Land Area (Km²)   27481 non-null  float64
 9   Density (P/Km²)   27481 non-null  int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 2.1+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4815 entries, 0 to 4814
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   textID            3534 non-null   object 
 1   text         

(None, None)

In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns
train.describe()

Unnamed: 0,Population -2020,Land Area (Km²),Density (P/Km²)
count,27481.0,27481.0,27481.0
mean,40184970.0,662173.0,357.686583
std,150494600.0,1807425.0,2013.750702
min,801.0,0.0,2.0
25%,1968001.0,22810.0,35.0
50%,8655535.0,111890.0,89.0
75%,28435940.0,527970.0,214.0
max,1439324000.0,16376870.0,26337.0


<u>Renaming columns</u>

In [7]:
# Column names as loaded, before any renaming
train.columns

Index(['textID', 'text', 'selected_text', 'sentiment', 'Time of Tweet',
       'Age of User', 'Country', 'Population -2020', 'Land Area (Km²)',
       'Density (P/Km²)'],
      dtype='object')

In [8]:
# Short snake_case replacements for the original column headers.
COLUMN_RENAMES = {
    "text": "tweet",
    "Age of User": "user_age",
    "Time of Tweet": "tweet_time",
    "Population -2020": "population",
    "Land Area (Km²)": "land_area",
    "Density (P/Km²)": "density",
}

# Apply the same renaming to both frames. The loop variable has its own name so
# it does not shadow the collection being iterated. inplace=True is kept because
# the following cells keep working on the `train` / `test` objects themselves.
for frame in (train, test):
    frame.rename(columns=COLUMN_RENAMES, inplace=True)

In [9]:
# Verify the new column names in both frames
train.columns,test.columns

(Index(['textID', 'tweet', 'selected_text', 'sentiment', 'tweet_time',
        'user_age', 'Country', 'population', 'land_area', 'density'],
       dtype='object'),
 Index(['textID', 'tweet', 'sentiment', 'tweet_time', 'user_age', 'Country',
        'population', 'land_area', 'density'],
       dtype='object'))

<u>Dropping redundant columns</u>

 - The train dataset has an extra column, "selected_text", that the test dataset does not have, so we drop it to keep both datasets uniform.

In [10]:
# errors="ignore" keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the column has already been removed.
train.drop(columns="selected_text",inplace=True,errors="ignore")

In [11]:
"selected_text" in train.columns

False

- "textID" is a unique identifier and will not contribute to the data analysis, so we drop it from both datasets.

In [12]:
# Drop the identifier column from both frames.
# errors="ignore" keeps this cell re-runnable: without it, a second run raises
# KeyError because "textID" is already gone.
for frame in (train, test):
    frame.drop(columns="textID",inplace=True,errors="ignore")

<u>Checking for duplicates</u>

In [13]:
# Count rows flagged as duplicates of an earlier row (True) vs. not (False)
train.duplicated().value_counts() #No duplicate records

False    27481
dtype: int64

<u>Checking for null values</u>

In [14]:
# Number of missing values per column in the training data
train.isnull().sum()

tweet         1
sentiment     0
tweet_time    0
user_age      0
Country       0
population    0
land_area     0
density       0
dtype: int64

Only one train record (far less than 10% of the data) contains a null value, so we can simply drop it.

In [15]:
# Remove every training row that has at least one missing value (modifies train in place)
train.dropna(how="any",inplace=True)

In [16]:
# Re-check the training data: every count should now be zero
train.isnull().sum()

tweet         0
sentiment     0
tweet_time    0
user_age      0
Country       0
population    0
land_area     0
density       0
dtype: int64

In [17]:
# Number of missing values per column in the test data
test.isnull().sum()

tweet         1281
sentiment     1281
tweet_time    1281
user_age      1281
Country       1281
population    1281
land_area     1281
density       1281
dtype: int64

Since the test dataset is used only for model validation, not for training the model, we can simply drop the rows that contain null values.

In [18]:
# Remove every test row that has at least one missing value (modifies test in place)
test.dropna(how="any",inplace=True)

In [19]:
# Re-check the test data: every count should now be zero
test.isnull().sum()

tweet         0
sentiment     0
tweet_time    0
user_age      0
Country       0
population    0
land_area     0
density       0
dtype: int64

<Data pre-processing>

<u>Data pre-processing</u>

- "tweet" column

In [20]:
# Look at the first five tweets before any text cleaning
train.tweet[:5]

0                  I`d have responded, if I were going
1        Sooo SAD I will miss you here in San Diego!!!
2                            my boss is bullying me...
3                       what interview! leave me alone
4     Sons of ****, why couldn`t they put them on t...
Name: tweet, dtype: object

In [21]:
#Creating a single PorterStemmer() object; TextPreprocessing uses it to stem every kept token
stemmer = PorterStemmer()

In [22]:
#Function for processing texts/tweets

#English stopwords, loaded on first use and then reused for every tweet
_ENGLISH_STOPWORDS = None

def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, building it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS

def TextPreprocessing(text: str) -> str:
    """Normalise one tweet into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lower-case the text;
      2. replace every character that is not a-z (digits, punctuation,
         non-ASCII letters, ...) with a space;
      3. tokenize with nltk.word_tokenize;
      4. drop English stopwords (checked on the un-stemmed token);
      5. stem the remaining tokens with the module-level ``stemmer``.

    Example: "Sooo SAD I will miss you here in San Diego!!!"
             -> "sooo sad miss san diego"

    Parameters
    ----------
    text : str
        Raw tweet text. Must be a string (null values have to be removed first).

    Returns
    -------
    str
        The cleaned tweet; an empty string if no token survives.
    """
    #converting the data to lower case
    text = text.lower()
    #removing special characters
    text = re.sub("[^a-z]"," ",text)
    #removing stopwords and stemming to root words
    #The stopword set is fetched once per call rather than calling
    #stopwords.words("english") for every token, which builds a fresh list each
    #time and then scans it linearly.
    stop_words = _english_stopwords()
    tokens = [stemmer.stem(word) for word in nltk.word_tokenize(text) if word not in stop_words]
    return " ".join(tokens)

In [23]:
#Clean the "tweet" column of both the train and the test frame with TextPreprocessing
for frame in (train, test):
    frame["tweet"] = frame["tweet"].apply(TextPreprocessing)

In [24]:
# The same first five tweets after cleaning, stopword removal and stemming
train.tweet[:5]

0                       respond go
1          sooo sad miss san diego
2                       boss bulli
3              interview leav alon
4    son put releas alreadi bought
Name: tweet, dtype: object

- Other columns

In [25]:
# Print the value distribution of every object-typed column except the free-text tweets.
# head(10) caps columns with many distinct values at their first ten entries of
# value_counts(); for columns with fewer than ten distinct values it returns the
# whole table, so no separate branch is needed.
categorical_cols = train.select_dtypes("object").drop(columns="tweet").columns
for col in categorical_cols:
    counts = train[col].value_counts()
    print("_".center(30,"_"),f"\nColumn : {col}\n",counts.head(10))

______________________________ 
Column : sentiment
 neutral     11117
positive     8582
negative     7781
Name: sentiment, dtype: int64
______________________________ 
Column : tweet_time
 morning    9161
noon       9160
night      9159
Name: tweet_time, dtype: int64
______________________________ 
Column : user_age
 0-20      4581
21-30     4580
46-60     4580
60-70     4580
70-100    4580
31-45     4579
Name: user_age, dtype: int64
______________________________ 
Column : Country
 Afghanistan                         149
Democratic Republic of the Congo    149
Egypt                               149
Ecuador                             149
Dominican Republic                  149
Albania                             149
Djibouti                            149
Denmark                             149
Czechia (Czech Republic)            149
Equatorial Guinea                   149
Name: Country, dtype: int64


Other columns are already in required format so we do not need to process them.

In [26]:
#Data after pre-processing looks as follows

In [27]:
# First three rows of the processed training data
train.head(3)

Unnamed: 0,tweet,sentiment,tweet_time,user_age,Country,population,land_area,density
0,respond go,neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,sooo sad miss san diego,negative,noon,21-30,Albania,2877797,27400.0,105
2,boss bulli,negative,night,31-45,Algeria,43851044,2381740.0,18


In [28]:
# First three rows of the processed test data
test.head(3)

Unnamed: 0,tweet,sentiment,tweet_time,user_age,Country,population,land_area,density
0,last session day http twitpic com ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0
1,shanghai also realli excit precis skyscrap gal...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0
2,recess hit veroniqu branquinho quit compani shame,negative,night,31-45,Algeria,43851044.0,2381740.0,18.0


<u>Exporting/Saving processed data</u>

In [29]:
# Write both processed frames as CSV text; index=False leaves the row index out of the files.
# NOTE(review): the file names have no ".csv" extension — presumably the follow-up
# notebook reads them under exactly these names; confirm before renaming.
train.to_csv("processed_train",index=False)
test.to_csv("processed_test",index=False)

In [30]:
# Read the saved training file back as a sanity check.
# NOTE(review): keep_default_na=False is presumably there so tweets that became
# empty strings are not read back as NaN — confirm.
pd.read_csv("processed_train",keep_default_na=False).head(3)

Unnamed: 0,tweet,sentiment,tweet_time,user_age,Country,population,land_area,density
0,respond go,neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,sooo sad miss san diego,negative,noon,21-30,Albania,2877797,27400.0,105
2,boss bulli,negative,night,31-45,Algeria,43851044,2381740.0,18


In [31]:
# Read the saved test file back as a sanity check.
# NOTE(review): keep_default_na=False is presumably there so tweets that became
# empty strings are not read back as NaN — confirm.
pd.read_csv("processed_test",keep_default_na=False).head(3)

Unnamed: 0,tweet,sentiment,tweet_time,user_age,Country,population,land_area,density
0,last session day http twitpic com ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0
1,shanghai also realli excit precis skyscrap gal...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0
2,recess hit veroniqu branquinho quit compani shame,negative,night,31-45,Algeria,43851044.0,2381740.0,18.0


*Next -> 02_EDA*