In [1]:
import pandas as pd
import numpy as np

In [2]:
import warnings

# Ignore every warning raised from this point on.
# NOTE(review): this also hides pandas warnings such as SettingWithCopyWarning;
# consider narrowing the filter to the specific categories that are noisy.
warnings.simplefilter("ignore")

In [3]:
# Read the raw tweets and rename the text column to "Sentence" in one chain.
twitter_df = pd.read_csv("twitter_data.csv").rename(columns={"clean_text": "Sentence"})
twitter_df.head(10)

Unnamed: 0,Sentence,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
5,kiya tho refresh maarkefir comment karo,0.0
6,surat women perform yagna seeks divine grace f...,0.0
7,this comes from cabinet which has scholars lik...,0.0
8,with upcoming election india saga going import...,1.0
9,gandhi was gay does modi,1.0


In [4]:
# Summarise column dtypes and non-null counts of the raw tweet frame.
twitter_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162980 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   Sentence  162976 non-null  object 
 1   category  162973 non-null  float64
dtypes: float64(1), object(1)
memory usage: 2.5+ MB


In [5]:
# Keep only the rows in which no column is missing, then summarise the result.
twitter_posts = twitter_df.dropna(axis=0, how="any")
twitter_posts.info()

<class 'pandas.core.frame.DataFrame'>
Index: 162969 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   Sentence  162969 non-null  object 
 1   category  162969 non-null  float64
dtypes: float64(1), object(1)
memory usage: 3.7+ MB


In [6]:
# Per-column count of missing values remaining after the drop.
twitter_posts.isna().sum()

Sentence    0
category    0
dtype: int64

In [7]:
# Count rows that exactly repeat an earlier row.
twitter_posts.duplicated().sum()

0

In [8]:
def decode_sentiment(sentiment):
    """Translate a numeric sentiment code into its text label.

    Args:
        sentiment: Numeric code: -1 for negative, 0 for neutral, 1 for positive.
            Integer and float forms of the same value are both accepted.

    Returns:
        str: "negative", "neutral" or "positive".

    Raises:
        ValueError: If ``sentiment`` is not one of the three known codes
            (NaN included). Previously any such value was silently labelled
            "positive", which would corrupt the labels without notice.
    """
    labels = {-1.0: "negative", 0.0: "neutral", 1.0: "positive"}
    try:
        return labels[sentiment]
    except (KeyError, TypeError):
        # KeyError: unknown or NaN code; TypeError: unhashable input.
        raise ValueError(f"unknown sentiment code: {sentiment!r}") from None

In [9]:
# Build a new frame with `.assign` instead of writing a column into the
# `dropna()` result in place; the in-place write is the pattern pandas flags
# with SettingWithCopyWarning. The function is passed to `.map` directly,
# since wrapping it in a lambda added nothing.
twitter_posts = twitter_posts.assign(
    Sentiment=twitter_posts["category"].map(decode_sentiment)
)

twitter_posts.head(10)

Unnamed: 0,Sentence,category,Sentiment
0,when modi promised “minimum government maximum...,-1.0,negative
1,talk all the nonsense and continue all the dra...,0.0,neutral
2,what did just say vote for modi welcome bjp t...,1.0,positive
3,asking his supporters prefix chowkidar their n...,1.0,positive
4,answer who among these the most powerful world...,1.0,positive
5,kiya tho refresh maarkefir comment karo,0.0,neutral
6,surat women perform yagna seeks divine grace f...,0.0,neutral
7,this comes from cabinet which has scholars lik...,0.0,neutral
8,with upcoming election india saga going import...,1.0,positive
9,gandhi was gay does modi,1.0,positive


In [10]:
# The numeric code is no longer needed once the text label exists.
twitter_posts = twitter_posts.drop("category", axis=1)
twitter_posts.head(10)

Unnamed: 0,Sentence,Sentiment
0,when modi promised “minimum government maximum...,negative
1,talk all the nonsense and continue all the dra...,neutral
2,what did just say vote for modi welcome bjp t...,positive
3,asking his supporters prefix chowkidar their n...,positive
4,answer who among these the most powerful world...,positive
5,kiya tho refresh maarkefir comment karo,neutral
6,surat women perform yagna seeks divine grace f...,neutral
7,this comes from cabinet which has scholars lik...,neutral
8,with upcoming election india saga going import...,positive
9,gandhi was gay does modi,positive


In [11]:
# Load the financial sentiment dataset and preview its first ten rows.
financial_df = pd.read_csv("financial_sentiment.csv")
financial_df.head(10)

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral
5,$SPY wouldn't be surprised to see a green close,positive
6,Shell's $70 Billion BG Deal Meets Shareholder ...,negative
7,SSH COMMUNICATIONS SECURITY CORP STOCK EXCHANG...,negative
8,Kone 's net sales rose by some 14 % year-on-ye...,positive
9,The Stockmann department store will have a tot...,neutral


In [12]:
# Summarise column dtypes and non-null counts of the financial frame.
financial_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5842 entries, 0 to 5841
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Sentence   5842 non-null   object
 1   Sentiment  5842 non-null   object
dtypes: object(2)
memory usage: 91.4+ KB


In [13]:
# Stack the two frames row-wise and renumber the rows from 0.
frames_to_stack = [twitter_posts, financial_df]
sentiment_df = pd.concat(frames_to_stack, ignore_index=True)
sentiment_df.head(10)

Unnamed: 0,Sentence,Sentiment
0,when modi promised “minimum government maximum...,negative
1,talk all the nonsense and continue all the dra...,neutral
2,what did just say vote for modi welcome bjp t...,positive
3,asking his supporters prefix chowkidar their n...,positive
4,answer who among these the most powerful world...,positive
5,kiya tho refresh maarkefir comment karo,neutral
6,surat women perform yagna seeks divine grace f...,neutral
7,this comes from cabinet which has scholars lik...,neutral
8,with upcoming election india saga going import...,positive
9,gandhi was gay does modi,positive


In [14]:
# Last ten rows of the stacked frame.
sentiment_df.tail(10)

Unnamed: 0,Sentence,Sentiment
168801,Operating profit fell to EUR 38.1 mn from EUR ...,negative
168802,"In 2008 , Kemira recorded revenue of approxima...",neutral
168803,Investments in product development stood at 6....,neutral
168804,HSBC Says Unit to Book $585 Million Charge on ...,negative
168805,Daily Mail parent company in talks with potent...,positive
168806,RISING costs have forced packaging producer Hu...,negative
168807,Nordic Walking was first used as a summer trai...,neutral
168808,"According shipping company Viking Line , the E...",neutral
168809,"In the building and home improvement trade , s...",neutral
168810,HELSINKI AFX - KCI Konecranes said it has won ...,positive


In [15]:
# Shuffle all rows with a fixed seed and renumber them from 0.
sentiment_df_copy = sentiment_df.sample(frac=1, random_state=42).reset_index(drop=True)
# Seed the preview too, so a fresh Restart & Run All displays the same ten rows.
sentiment_df_copy.sample(10, random_state=42)

Unnamed: 0,Sentence,Sentiment
87669,anyone upa didn’ give positive response missio...,positive
31624,wish karma ignoring bhagawan ram and gomatha c...,neutral
163878,you mean taarikh taarikh will wonderful see mo...,positive
111440,rahul goes muslim dominated constituency congr...,neutral
2730,hahahahahahaha they got their ass whooped paki...,negative
113136,but modi isnt fool,neutral
168557,congress first family wont back power modi tim...,positive
128842,dear kumar swamy mandya you telling for nikil ...,neutral
142673,honestly think must appreciate modi does even ...,negative
72886,think arnab shuld ask modi whether likes dhosa...,positive


In [16]:
# Load the additional tweet dataset.
tweets_df = pd.read_csv("tweets3.csv")
# Fixed seed so the preview shows the same ten rows on every run.
tweets_df.sample(10, random_state=42)

Unnamed: 0,textID,text,selected_text,sentiment
5211,6b74e03b6d,Oh I`ve got that one & the stp x step one on ...,Oh I`ve got that one & the stp x step one on V...,neutral
3174,60c374b906,Is eating BBQ Jalapeno Torta Subway from Los C...,Is eating BBQ Jalapeno Torta Subway from Los C...,neutral
2943,dc91cd5158,Dell FX100 Pc-over-IP audio device (Teradici) ...,hard,negative
3419,d438d00c44,What the hell Ross?! Where is Hugh Laurie! He...,What the hell Ross?! Where is Hugh Laurie! He ...,neutral
15449,1dec7590cb,hehee!! yea its supposed to sound mean.. hahhaa,supposed to sound mean.,negative
15721,d136417805,I`m sorry...I`ll make sure I do that next time.,sorry.,negative
26390,6b9f7a3faf,i need a jb but i dread the fact that it will ...,dread,negative
13413,e1eac039b5,DAMMIT! lets have a private session,DAMMIT!,negative
12486,0a887ad3bc,Yao...broken foot...so much for that series D...,broken,negative
1187,3474d5dce1,"outta the shower, too bad justin couldn`t spen...",bad,negative


In [17]:
# Summarise column dtypes and non-null counts of the loaded tweets.
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27481 non-null  object
 1   text           27480 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      27481 non-null  object
dtypes: object(4)
memory usage: 858.9+ KB


In [18]:
# Keep only the rows in which no column is missing, then summarise the result.
new_tweets = tweets_df.dropna(axis=0, how="any")
new_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27480 entries, 0 to 27480
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27480 non-null  object
 1   text           27480 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      27480 non-null  object
dtypes: object(4)
memory usage: 1.0+ MB


In [19]:
# Per-column count of missing values remaining after the drop.
new_tweets.isna().sum()

textID           0
text             0
selected_text    0
sentiment        0
dtype: int64

In [20]:
# Count rows that exactly repeat an earlier row.
new_tweets.duplicated().sum()

0

In [22]:
# Rename the text and label columns to the shared names, then keep only those two.
new_tweets_copy = new_tweets.rename(
    columns={"text": "Sentence", "sentiment": "Sentiment"}
)[["Sentence", "Sentiment"]]
new_tweets_copy.head()

Unnamed: 0,Sentence,Sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


In [23]:
# Number of rows per sentiment label.
new_tweets_copy["Sentiment"].value_counts()

Sentiment
neutral     11117
positive     8582
negative     7781
Name: count, dtype: int64

In [24]:
# Append the extra tweets to the earlier shuffled frame, renumbering rows from 0.
final_df = pd.concat([sentiment_df_copy, new_tweets_copy], ignore_index=True)
# Shuffle the full set with a fixed seed, then renumber the rows again.
shuffled_rows = final_df.sample(frac=1, random_state=42)
final_df_copy = shuffled_rows.reset_index(drop=True)

In [25]:
# First five rows of the final frame.
final_df_copy.head()

Unnamed: 0,Sentence,Sentiment
0,lok sabha election live modi address rallies t...,positive
1,live public meeting,positive
2,any one who asks modi rahul unconstitutional a...,neutral
3,pmopm modi kickstarts bjp’ campaign for lok sa...,neutral
4,nehru died 1964 isro was estd 1969 overenthusi...,neutral


In [26]:
# Summarise column dtypes and non-null counts of the final frame.
final_df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196291 entries, 0 to 196290
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   Sentence   196291 non-null  object
 1   Sentiment  196291 non-null  object
dtypes: object(2)
memory usage: 3.0+ MB


In [27]:
# Number of rows per sentiment label in the final frame.
final_df_copy["Sentiment"].value_counts()

Sentiment
positive    82683
neutral     69458
negative    44150
Name: count, dtype: int64

In [28]:
# Write the final frame to CSV; index=False leaves the row index out of the file.
final_df_copy.to_csv('twitter_posts_sentiment_new.csv', index=False)