In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from textblob import TextBlob
from nltk.corpus import stopwords  
from nltk.stem.porter import PorterStemmer 
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the tweet export; the first CSV column is used as the row index.
data_tweet = pd.read_csv(
    "Elon_musk (1).csv",
    encoding="latin1",
    index_col=0,
)

# Data preview

In [3]:
# Show the freshly loaded frame (a single "Text" column of raw tweets).
data_tweet

Unnamed: 0,Text
1,@kunalb11 Im an alien
2,@ID_AA_Carmack Ray tracing on Cyberpunk with H...
3,@joerogan @Spotify Great interview!
4,@gtera27 Doge is underestimated
5,@teslacn Congratulations Tesla China for amazi...
...,...
1995,"@flcnhvy True, it sounds so surreal, but the n..."
1996,@PPathole Make sure to read ur terms &amp; con...
1997,@TeslaGong @PPathole Samwise Gamgee
1998,@PPathole Altho Dumb and Dumber is <U+0001F525...


In [4]:
# Sanity check: selecting one column yields a pandas Series.
type(data_tweet["Text"])

pandas.core.series.Series

In [12]:
# Count missing values per column before cleaning.
data_tweet.isna().sum()

Text    0
dtype: int64

In [13]:
# Discard any row containing a missing value so preprocess() only ever
# receives strings.
data_tweet = data_tweet.dropna(axis=0, how="any")

### pre-processing the data

In [14]:
def preprocess(x):
    """Normalise one tweet into a space-joined string of stemmed tokens.

    Steps, in order: delete every character that is not an ASCII letter or a
    space, lower-case, split on whitespace, drop NLTK English stopwords, and
    Porter-stem the remaining tokens.

    Parameters
    ----------
    x : str
        Raw tweet text.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces; an empty string
        when no token survives.
    """
    # Build the stopword set and the stemmer once per call. The original
    # rebuilt both for every single token, which is loop-invariant work.
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    # Characters are deleted, not replaced by a space, so handles and
    # contractions are fused: "@kunalb11" -> "kunalb", "&amp;" -> "amp".
    letters_only = re.sub("[^a-zA-Z ]", "", x)
    tokens = letters_only.lower().split()

    return " ".join(
        stemmer.stem(token) for token in tokens if token not in stop_words
    )

In [15]:
# Clean every tweet with preprocess().
# NOTE(review): this overwrites the raw text in place, so the original tweets
# are no longer available afterwards and re-running the cell re-processes
# already-cleaned text.
data_tweet["Text"] = data_tweet["Text"].apply(preprocess)

In [16]:
# Inspect the tweets after cleaning and stemming.
data_tweet

Unnamed: 0,Text
1,kunalb im alien
2,idaacarmack ray trace cyberpunk hdr nextlevel tri
3,joerogan spotifi great interview
4,gtera doge underestim
5,teslacn congratul tesla china amaz execut last...
...,...
1995,flcnhvi true sound surreal neg propaganda stil...
1996,ppathol make sure read ur term amp condit clic...
1997,teslagong ppathol samwi gamg
1998,ppathol altho dumb dumber ufuf


### polarity of data

In [34]:
 def getPolarity(text):
     """Return the TextBlob sentiment polarity score for ``text``."""
     blob = TextBlob(text)
     return blob.sentiment.polarity

In [35]:
# Score every tweet with getPolarity().
# NOTE(review): "Text" already holds the stemmed, stopword-free output of
# preprocess() at this point, so TextBlob is scoring stems such as "amaz" and
# "congratul" rather than real words -- consider scoring the raw text instead.
data_tweet['polarity'] = data_tweet['Text'].apply(getPolarity)

In [36]:
# Inspect the frame with the new "polarity" column.
data_tweet

Unnamed: 0,Text,polarity
1,kunalb im alien,-0.250000
2,idaacarmack ray trace cyberpunk hdr nextlevel tri,0.000000
3,joerogan spotifi great interview,0.800000
4,gtera doge underestim,0.000000
5,teslacn congratul tesla china amaz execut last...,0.000000
...,...,...
1995,flcnhvi true sound surreal neg propaganda stil...,0.186667
1996,ppathol make sure read ur term amp condit clic...,0.500000
1997,teslagong ppathol samwis gamge,0.000000
1998,ppathol altho dumb dumber ufuf,-0.375000


In [37]:
def getAnalysis(score):
    """Bucket a polarity score into a sentiment label.

    Returns 'Negative' for scores below zero, 'Neutral' for a score equal to
    zero, and 'Positive' for anything else.
    """
    if score < 0:
        return 'Negative'
    if score == 0:
        return 'Neutral'
    return 'Positive'

In [38]:
# Turn each polarity score into a Negative / Neutral / Positive label.
data_tweet['sentiment'] = data_tweet['polarity'].apply(getAnalysis)

In [39]:
# Inspect the frame with the new "sentiment" label column.
data_tweet

Unnamed: 0,Text,polarity,sentiment
1,kunalb im alien,-0.250000,Negative
2,idaacarmack ray trace cyberpunk hdr nextlevel tri,0.000000,Neutral
3,joerogan spotifi great interview,0.800000,Positive
4,gtera doge underestim,0.000000,Neutral
5,teslacn congratul tesla china amaz execut last...,0.000000,Neutral
...,...,...,...
1995,flcnhvi true sound surreal neg propaganda stil...,0.186667,Positive
1996,ppathol make sure read ur term amp condit clic...,0.500000,Positive
1997,teslagong ppathol samwis gamge,0.000000,Neutral
1998,ppathol altho dumb dumber ufuf,-0.375000,Negative


In [40]:
# Class balance of the derived labels.
data_tweet['sentiment'].value_counts()

Neutral     1189
Positive     649
Negative     161
Name: sentiment, dtype: int64

In [41]:
# Encode the string labels as integers for the classifiers.
# NOTE(review): not safe to run twice -- once the column holds integers, none
# of the dictionary keys match and Series.map yields NaN for every row.
data_tweet["sentiment"] = data_tweet["sentiment"].map(
    {"Negative": -1, "Positive": 1, "Neutral": 0}
)

In [42]:
# Inspect the frame after the labels were encoded as -1 / 0 / 1.
data_tweet

Unnamed: 0,Text,polarity,sentiment
1,kunalb im alien,-0.250000,-1
2,idaacarmack ray trace cyberpunk hdr nextlevel tri,0.000000,0
3,joerogan spotifi great interview,0.800000,1
4,gtera doge underestim,0.000000,0
5,teslacn congratul tesla china amaz execut last...,0.000000,0
...,...,...,...
1995,flcnhvi true sound surreal neg propaganda stil...,0.186667,1
1996,ppathol make sure read ur term amp condit clic...,0.500000,1
1997,teslagong ppathol samwis gamge,0.000000,0
1998,ppathol altho dumb dumber ufuf,-0.375000,-1


In [43]:
# Remove the intermediate polarity score now that it has been converted into
# a class label. The original call discarded the frame returned by drop(), so
# the column silently stayed in data_tweet; assigning the result back makes
# the removal stick. errors="ignore" keeps the cell safe to re-run after the
# column is already gone.
data_tweet=data_tweet.drop(columns="polarity",errors="ignore")
data_tweet

Unnamed: 0,Text,sentiment
1,kunalb im alien,-1
2,idaacarmack ray trace cyberpunk hdr nextlevel tri,0
3,joerogan spotifi great interview,1
4,gtera doge underestim,0
5,teslacn congratul tesla china amaz execut last...,0
...,...,...
1995,flcnhvi true sound surreal neg propaganda stil...,1
1996,ppathol make sure read ur term amp condit clic...,1
1997,teslagong ppathol samwis gamge,0
1998,ppathol altho dumb dumber ufuf,-1


### count vectorizer

In [44]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

# Bag-of-words features: one row per tweet, one column per vocabulary term.
cv=CountVectorizer()
X=cv.fit_transform(data_tweet.Text.values).toarray()
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on old installations.
feature_names=(
    cv.get_feature_names_out()
    if hasattr(cv,"get_feature_names_out")
    else cv.get_feature_names()
)
X=pd.DataFrame(X,columns=feature_names)



In [45]:
# Inspect the term-count matrix built by the CountVectorizer.
X

Unnamed: 0,aber,abl,abo,aboard,abort,absenc,absolut,absorb,absorpt,absurd,...,zero,zikryzamir,zip,zogfotpik,zon,zone,zshauladventur,zubinanari,zwiebelbach,zzcool
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1994,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
# Hold out 30% of the tweets for evaluation; the seed fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    data_tweet.sentiment,
    test_size=0.3,
    random_state=10,
)

# Decision tree and logistic regression

In [47]:
# Fit each candidate classifier on the training split and report its accuracy
# on the held-out split.
# NOTE(review): the target labels were generated by TextBlob from the same
# text, so this accuracy measures agreement with TextBlob's scoring rather
# than with human-annotated sentiment.
models=[DecisionTreeClassifier(random_state=10),LogisticRegression()]
for model in models:
    model.fit(x_train,y_train)
    pred=model.predict(x_test)
    # accuracy_score is documented as (y_true, y_pred). The original passed
    # the arguments swapped, which is harmless for accuracy but becomes wrong
    # as soon as an asymmetric metric is substituted.
    print(model," accuracy :: ",metrics.accuracy_score(y_test,pred))

DecisionTreeClassifier(random_state=10)  accuracy ::  0.9083333333333333
LogisticRegression()  accuracy ::  0.8383333333333334


In [None]:
# `model` is the loop variable left over from the training loop, i.e. the
# last estimator that was fitted there.
model