In [26]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
import re

In [3]:
em=pd.read_csv("Elon_musk.csv",encoding='unicode_escape')

In [4]:
em.head()

Unnamed: 0.1,Unnamed: 0,Text
0,1,@kunalb11 Iﾂ知 an alien
1,2,@ID_AA_Carmack Ray tracing on Cyberpunk with H...
2,3,@joerogan @Spotify Great interview!
3,4,@gtera27 Doge is underestimated
4,5,@teslacn Congratulations Tesla China for amazi...


# EDA

In [5]:
em.duplicated().sum()

0

In [6]:
em.isna().sum()

Unnamed: 0    0
Text          0
dtype: int64

# Dropping the Unnamed column

In [7]:
em=em.drop(columns=('Unnamed: 0'),axis=1)

In [8]:
em.head()

Unnamed: 0,Text
0,@kunalb11 Iﾂ知 an alien
1,@ID_AA_Carmack Ray tracing on Cyberpunk with H...
2,@joerogan @Spotify Great interview!
3,@gtera27 Doge is underestimated
4,@teslacn Congratulations Tesla China for amazi...


# Data preprocessing

Remove space and symbols in Text column

In [9]:
em.Text=em.Text.apply(lambda x: re.sub('[^a-zA-Z]'," ",x))

In [10]:
em.head()

Unnamed: 0,Text
0,kunalb I m an alien
1,ID AA Carmack Ray tracing on Cyberpunk with H...
2,joerogan Spotify Great interview
3,gtera Doge is underestimated
4,teslacn Congratulations Tesla China for amazi...


Make Text from upper case to lower case

In [11]:
em.Text=em.Text.apply(lambda x: x.lower())

In [12]:
em.head()

Unnamed: 0,Text
0,kunalb i m an alien
1,id aa carmack ray tracing on cyberpunk with h...
2,joerogan spotify great interview
3,gtera doge is underestimated
4,teslacn congratulations tesla china for amazi...


Splitting the WORDS from line

In [13]:
em.Text=em.Text.apply(lambda x: x.split())

In [14]:
em.head()

Unnamed: 0,Text
0,"[kunalb, i, m, an, alien]"
1,"[id, aa, carmack, ray, tracing, on, cyberpunk,..."
2,"[joerogan, spotify, great, interview]"
3,"[gtera, doge, is, underestimated]"
4,"[teslacn, congratulations, tesla, china, for, ..."


Removing the stop words

In [15]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [16]:
stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [17]:
em.Text=em.Text.apply(lambda x:[word for word in x if word not in set(stopwords.words("english"))])

In [18]:
em.head()

Unnamed: 0,Text
0,"[kunalb, alien]"
1,"[id, aa, carmack, ray, tracing, cyberpunk, hdr..."
2,"[joerogan, spotify, great, interview]"
3,"[gtera, doge, underestimated]"
4,"[teslacn, congratulations, tesla, china, amazi..."


Stemming

In [19]:
prs=PorterStemmer()
em.Text=em.Text.apply(lambda x:[prs.stem(word) for word in x])

In [20]:
em.head()

Unnamed: 0,Text
0,"[kunalb, alien]"
1,"[id, aa, carmack, ray, trace, cyberpunk, hdr, ..."
2,"[joerogan, spotifi, great, interview]"
3,"[gtera, doge, underestim]"
4,"[teslacn, congratul, tesla, china, amaz, execu..."


Rejoining

In [21]:
em['Text']=em['Text'].apply(lambda x:" ".join(x))

In [22]:
em.head()

Unnamed: 0,Text
0,kunalb alien
1,id aa carmack ray trace cyberpunk hdr next lev...
2,joerogan spotifi great interview
3,gtera doge underestim
4,teslacn congratul tesla china amaz execut last...


Sentiment Analysis

In [23]:
from textblob import TextBlob
em['smt']=em['Text'].apply(lambda x: TextBlob(x).sentiment[0])

In [24]:
em.head()

Unnamed: 0,Text,smt
0,kunalb alien,-0.25
1,id aa carmack ray trace cyberpunk hdr next lev...,0.0
2,joerogan spotifi great interview,0.8
3,gtera doge underestim,0.0
4,teslacn congratul tesla china amaz execut last...,0.0
