In [None]:
pip install transformers

In [None]:
pip install raceplotly

In [None]:
import pandas as pd
import torch
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from datetime import date
import calendar
import re
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from raceplotly.plots import barplot
import nltk
# Download the NLTK stop-word corpus that backs `nltk.corpus.stopwords`.
nltk.download('stopwords')

from transformers import AutoTokenizer, AutoModelForSequenceClassification
# One checkpoint name shared by tokenizer and classifier so the two always match.
MODEL_NAME = 'nlptown/bert-base-multilingual-uncased-sentiment'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

In [None]:
# Load the Reddit posts CSV.
data = pd.read_csv('/content/reddit_wsb.csv')
# Uncomment the line below if you run out of RAM and only want to see how the code works.
#data=data.iloc[0:2500]
# Parse the timestamp column into datetimes so that date-based analysis is possible later.
data['timestamp'] = pd.to_datetime(data['timestamp'])

In [None]:
# Summarise column dtypes and non-null counts to see where values are missing.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48260 entries, 0 to 48259
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   title      48260 non-null  object        
 1   score      48260 non-null  int64         
 2   id         48260 non-null  object        
 3   url        48260 non-null  object        
 4   comms_num  48260 non-null  int64         
 5   created    48260 non-null  float64       
 6   body       22521 non-null  object        
 7   timestamp  48260 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(2), object(4)
memory usage: 2.9+ MB



*   Replacing NaN values with an empty string. We will merge the 'title' and 'body' columns so that the model gets a broader range of words to work with, and the 'body' column contains many NaN values, as the output of the previous cell shows: only 22521 of its 48260 entries are non-null.
NaN values make up 53.33% of the 'body' column.

*   dropping 'score','id' and 'created' column as they do not serve any purpose in determining the sentiments


In [None]:
# Fill missing values only in the text columns: a blanket NaN -> '' replacement
# would also rewrite numeric/datetime columns, and `regex=True` is documented
# as requiring a string pattern rather than NaN.
data = data.fillna({'title': '', 'url': '', 'body': ''})
# These columns are not used for sentiment; errors='ignore' keeps the cell
# re-runnable once they have already been removed.
data = data.drop(columns=['score', 'id', 'created'], errors='ignore')

Getting the total number of words in the 'title' column for each record

In [None]:
# Word count of each title, splitting on single spaces.
data['length'] = [len(title.split(' ')) for title in data['title']]

Creating a function 'weekdy' that returns the name of the weekday on which a post was made, based on the timestamp column

In [None]:
def weekdy(x):
    """Return the weekday name (e.g. 'Monday') for a date-like value.

    `x` must provide a `weekday()` method whose result indexes
    `calendar.day_name` (0 is Monday, 6 is Sunday).
    """
    weekday_index = x.weekday()
    return calendar.day_name[weekday_index]
# Weekday name of each post's timestamp.
data['weekdays'] = data['timestamp'].map(weekdy)

**What is the distribution of title length in the data??**


In [None]:
# Histogram of title word counts.
px.histogram(
    data_frame=data,
    x='length',
    template='seaborn',
)

Most titles are between 0 and 20 words long. Why is that? Titles are kept short because they only need to give you the gist of what lies in the body, so they don't need to be long :)

**Post Distribution In Reddit according to days in a Week In Wall Street**

In [None]:
# Number of posts per weekday, one colour per day.
px.histogram(
    data_frame=data,
    x='weekdays',
    color='weekdays',
    template='seaborn',
)

Preprocessing a given text before feeding into model is always a good practice, because our texts include many emoticons, urls, websites, hashtags which don't contribute to sentiment analysis.

*   The 'preprocess' function replaces every character in our texts that is not an English letter (a-z, A-Z) with a space



In [None]:
def preprocess(text):
    """Return `text` with every character that is not an ASCII letter replaced by a space."""
    non_letter = re.compile('[^a-zA-Z]')
    return non_letter.sub(' ', text)



*   Merging "title" and "body" as said before for better accuracy and storing them into a column named "full text".
*   Limiting each text to its first 500 characters, in order to reduce computation time, and storing the result in a column named "limited text".
*   Passing the limited texts to preprocess function to get the processed texts which can be fed to our model and storing them in the column named "processed_text"
*   Now, dropping "full text" and "limited text" columns as they serve no further purpose





In [None]:
# Title and body joined by a space; kept as a local Series rather than a temporary column.
full_text = data['title'] + " " + data['body'].astype("str")
# Keep the first 500 characters of each text, then replace non-letters via preprocess().
limited_text = full_text.map(lambda combined: combined[:500])
data['processed_text'] = limited_text.map(preprocess)

A model can't take raw text as input; it only understands numbers. So we have to convert each text into a sequence of numeric token ids before feeding it into the model. The tokenizer is the component that performs this task.


*   Created a list with the name "tok" which will be having numerical form of text from each record




In [None]:
# One tensor of token ids per record, in row order.
tok = [
    tokenizer.encode(record_text, return_tensors='pt')
    for record_text in data['processed_text']
]

Our model predicts emotion of a given statement by computing 5 unique values. The maximum value's position gives us the sentiment.

*   highest value is in 1st place - highly negative statement
*   highest value is in 2nd place - negative statement
*   highest value is in 3rd place - neutral statement
*   highest value is in 4th place - positive statement
*   highest value is in 5th place - highly positive statement

The "result" list will hold the model output for each record's text




In [None]:
result = []
# Inference only: without no_grad every stored output keeps its autograd graph
# alive, which inflates memory use considerably when all records are processed.
with torch.no_grad():
    for token_ids in tok:
        result.append(model(token_ids))

Creating columns named "MOST NEGATIVE", "NEGATIVE", "NEUTRAL", "POSITIVE" and "MOST POSITIVE" in our data frame and storing the corresponding sentiment values in them.

In [None]:
# Flatten the logits of every record into one long list, record by record.
x = [float(logit) for output in result for logit in output.logits[0]]

# With five logits per record, every fifth entry starting at offset k
# belongs to sentiment class k.
x1 = x[0::5]
x2 = x[1::5]
x3 = x[2::5]
x4 = x[3::5]
x5 = x[4::5]

In [None]:
# Attach one score column per sentiment class, ordered from most negative to most positive.
score_columns = ['MOST NEGATIVE', 'NEGATIVE', 'NEUTRAL', 'POSITIVE', 'MOST POSITIVE']
for column_name, scores in zip(score_columns, [x1, x2, x3, x4, x5]):
    data[column_name] = scores

Now, getting the most dominant sentiment among all five columns and storing it in "DOMINANT" column.

In [None]:
# drop=True only renumbers the rows. Keeping the old index would add a stray
# numeric 'index' column (and 'level_0' on a re-run), making the cell
# non-repeatable.
data = data.reset_index(drop=True)
sentiment_columns = ['MOST NEGATIVE', 'NEGATIVE', 'NEUTRAL', 'POSITIVE', 'MOST POSITIVE']
# For each row, the name of the column holding the largest score.
data['DOMINANT'] = data[sentiment_columns].idxmax(axis=1)

**Relationship between Emotion and texts**

In [None]:
# Number of records per dominant sentiment.
px.histogram(
    data_frame=data,
    x='DOMINANT',
    template='seaborn',
)

**Relationship between Emotion and Number of comments**

In [None]:
# Sum only the comment counts. Summing every column would also try to add up
# the text and timestamp columns, which is wasteful and fails on pandas
# versions that no longer drop non-numeric columns silently.
rel = data.groupby('DOMINANT')[['comms_num']].sum()
px.bar(x=rel.index,y=rel['comms_num'].values,template='seaborn',labels={'x':'Emotion','y':'Number of comments'})


**10 Most Common Domains Shared In The URL Column**

In [None]:
def _domain_label(url):
    """Reduce a URL to a short site label, e.g. 'https://www.reddit.com/r/x' -> 'reddit'."""
    if '://' in url:
        # Host is whatever sits between the scheme separator and the next slash.
        host = url.split('://', 1)[1].split('/', 1)[0]
    else:
        # No scheme: treat everything before the first slash as the host, so
        # short or scheme-less values cannot raise IndexError.
        host = url.split('/', 1)[0]
    if 'www.' in host:
        host = host.split('www.')[1]
    if '.com' in host:
        host = host.split('.com')[0]
    return host


# Count how often each site label occurs; value_counts() orders them most frequent first.
text = pd.Series([_domain_label(url) for url in data['url']], name='text').value_counts()
px.bar(x=text.index[:10],y=text.values[:10],template='seaborn',labels={'x':'Domains','y':'Count'})

Creating a general-purpose function "sentiment_score" which takes any sentence as input and returns the sentiment of that sentence as a word label. I have also applied this function to our dataframe to predict the sentiment of each record again.

In [None]:
def sentiment_score(review):
    """Classify a piece of text into one of five sentiment labels.

    Args:
        review: The text to classify.

    Returns:
        One of "Most Negative", "Negative", "Neutral", "Positive" or
        "Most positive", chosen by the position of the largest logit.
    """
    # Ordered by class index 0-4; spellings are kept exactly as callers already see them.
    labels = ("Most Negative", 'Negative', "Neutral", "Positive", 'Most positive')
    # Truncate long inputs so arbitrary text cannot exceed the encoder's
    # 512-position limit (BERT-base) and crash the forward pass.
    tokens = tokenizer.encode(review, return_tensors='pt', truncation=True, max_length=512)
    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        result = model(tokens)
    return labels[int(torch.argmax(result.logits))]

In [None]:
# Quick sanity check on a few hand-written sentences.
sample_sentences = (
    'I am not sure about pizaas',
    'I love you',
    'I do not love you',
    'I think I am good',
    'This is the worst pizza ever',
)
tuple(sentiment_score(sentence) for sentence in sample_sentences)

('Neutral', 'Most positive', 'Most Negative', 'Positive', 'Most Negative')

In [None]:
# Label every record's processed text with sentiment_score().
data['sentiment'] = data['processed_text'].map(sentiment_score)

Saving a dataframe with only processed text and its sentiments to a csv file with name sentiment_analysis.

In [None]:
# Independent frame holding only the processed text and its predicted label.
df = data[['processed_text', 'sentiment']].copy()

In [None]:
# Write the labelled texts to disk without the row index.
df.to_csv(path_or_buf='sentiment_analysis.csv', index=False)

In [None]:
!sudo apt-get install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 39 not upgraded.
Need to get 2,129 kB of archives.
After this operation, 7,662 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 git-lfs amd64 2.3.4-1 [2,129 kB]
Fetched 2,129 kB in 1s (1,655 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected package git-lfs.
(Reading database ... 160772 files and directories c

In [None]:
# Interactive Hugging Face login. The command prints the access token, so clear
# this cell's output before saving or sharing the notebook.
!transformers-cli login

2021-06-03 18:49:04.108252: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0

        _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
        _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
        _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
        _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
        _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

        
Username: luka89661@gmail.com
Password: 
Login successful
Your token: <REDACTED - this token was exposed in saved notebook output; revoke it and generate a new one>

Your token has been saved to /root/.huggingface/tok

In [None]:
# Set the global git identity (email and name) for this machine.
# NOTE(review): presumably needed for the push_to_hub commits below; confirm.
!git config --global user.email "luka89661@gmail.com"
!git config --global user.name "orange"

In [None]:
# Upload the model to the Hub repository "sentiment_analysis".
model.push_to_hub("sentiment_analysis")

In [None]:
# Upload the tokenizer to the same Hub repository name as the model.
tokenizer.push_to_hub("sentiment_analysis")