# NLP/Sentiment Analysis

### Import Dependencies

In [38]:
from textblob import TextBlob
import pandas as pd
import random
# from collections import Counter
# import matplotlib.pyplot as plt

### Connecting with MongoDB

In [39]:
# Importing dependencies to import datasets from MongoDB
from pymongo import MongoClient
import os

In [40]:
# Creating a connection with MongoDB
client=MongoClient('localhost', 27017)

In [41]:
# Providing list of datasets for US Elections Twitter data
db=client.us_election_twitter
collect_names=db.list_collection_names()
collect_names

['romney3_12_df',
 'mccain1_08_df',
 'romney1_12_df',
 'barack1_08_df',
 'trump3_20_df',
 'trump2_20_df',
 'trump1_20_df',
 'trump1_16_df',
 'trump2_16_df',
 'hillary2_16_df',
 'romney2_12_df',
 'biden1_20_df',
 'biden2_20_df',
 'barack3_12_df',
 'biden3_20_df',
 'mccain3_08_df',
 'barack3_08_df',
 'mccain2_08_df',
 'hillary1_16_df',
 'barack2_12_df',
 'trump3_16_df',
 'barack1_12_df',
 'barack2_08_df',
 'hillary3_16_df']

### Importing Collections from MongoDB

In [58]:
data=db.biden2_20_df
h_list=data.find()
biden2_20=pd.DataFrame(list(data.find()))

In [59]:
data=db.trump2_20_df
h_list=data.find()
trump2_20=pd.DataFrame(list(data.find()))

In [60]:
# Keeping the following columns: 'tweet', 'replies_count', 'retweets_count', 'like_count'
biden2_20=pd.DataFrame(biden2_20, columns=['tweet','replies_count','retweets_count','likes_count'])
trump2_20=pd.DataFrame(trump2_20, columns=['tweet','replies_count','retweets_count','likes_count'])
biden2_20['candidate']='Biden' # Adding separate column to identify candidate
trump2_20['candidate']='Trump' # Adding separate column to identify candidate

### Random Samples of Datasets

In [93]:
# # As this was a big data project we decided to choose random samples of 30 percent for each 
# # dataset to facilitate quicker analysis and lower the probability of slow machine performance
# biden3_20=biden3_20.sample(frac = 0.3)
# trump3_20=trump3_20.sample(frac = 0.3)

### Preprocessing of Data

### Create Funciton to Clean Tweets

In [61]:
# Import Dependency for RegEx
import re
# import emoji 

def cleantweet(text):
    text=re.sub(r'@[A-Za-z0-9_]+', '', text) # This removes @ mentions
    text=re.sub(r'https?:\/\/\S+', '', text) # This removes the hyperlinks
    text=re.sub(r'#[A-Za-z0-9_]+', '', text) # This removes the hashtag mentions
    text=re.sub(r'\W', ' ', text) # This removes all special characters
    text=re.sub(r'[^\x00-\x7F]+',' ', text) # This replaces non-ASCII characters with space
    text=re.sub(r'\s+[a-zA-Z]\s+', ' ', text) # Removing all single characters left as a result of removing all special characters
    text=re.sub(r'\^[a-zA-Z]\s+', ' ', text) # Removing all single characters from the start
    text=re.sub(r'_', '', text) # This removes underscore symbols
    text=re.sub(r'\s+', ' ', text, flags=re.I) # Replacing multiple spaces with single spaces
#     text = text.lower() # Converts all text to lowercase
#     text=emoji.get_emoji_regexp().sub(u'',text) # This removes the emojis
#     text=re.sub('\n', ' ', text) # This removes linebreaks
    return text

### Creating Function to Drop Empty Tweets

In [62]:
import numpy as np
def drop_empty_tweets(db):
    db=db.replace(r'^\s*$', np.nan, regex=True) # Replace empty cells with NaN value
    return db

### Cleaning tweets, dropping rows with NaN values and Creating New Column

In [63]:
# Applying function to clean tweets and drop empty tweets for Biden
biden2_20['tweet']=biden2_20['tweet'].apply(cleantweet)
biden2_20=biden2_20.apply(drop_empty_tweets)

In [64]:
# Applying function to clean tweets and drop empty tweets for Trump
trump2_20['tweet']=trump2_20['tweet'].apply(cleantweet)
trump2_20=trump2_20.apply(drop_empty_tweets)

### Dropping NaN values

In [65]:
biden2=biden2_20.dropna()
trump2=trump2_20.dropna()

### Creating Length of Tweets Feature

In [68]:
biden2['tweet'] = biden2['tweet'].astype(str) # Converting tweet column to string before splitting
biden2['tweet_length']= biden2['tweet'].str.split().str.len() # Creating separate column with length of tweets

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [70]:
trump2['tweet'] = trump2['tweet'].astype(str) # Converting tweet column to string before splitting
trump2['tweet_length']= trump2['tweet'].str.split().str.len() # Creating separate column with length of tweets

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### Joining both Biden and Trump Datasets

In [71]:
# Joining both Biden and Trump Datasets
second_debate=pd.concat([biden2,trump2])
second_debate.count()

tweet             995049
replies_count     995049
retweets_count    995049
likes_count       995049
candidate         995049
tweet_length      995049
dtype: int64

In [72]:
# Create function to obtain subjectivity
def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity

# Create function to obtain polarity
def getPolarity(text):
    return TextBlob(text).sentiment.polarity

# Create Columns for Subjectivity and Polarity
second_debate['Subjectivity']=second_debate['tweet'].apply(getSubjectivity)
second_debate['Polarity']=second_debate['tweet'].apply(getPolarity)
second_debate.head(60)

Unnamed: 0,tweet,replies_count,retweets_count,likes_count,candidate,tweet_length,Subjectivity,Polarity
0,Hypothetical Question Let say Joe Biden wins t...,0,0,0,Biden,48,0.2875,0.215625
1,want the Healthcare fixed,0,0,0,Biden,4,0.2,0.1
2,Joe Biden and Kamala Harris will not let Donal...,0,1,1,Biden,25,0.383333,-0.2
3,Apparently thinks being compared to Mister Rog...,0,0,0,Biden,17,0.305,0.155
4,And Joe Biden doesn break the law mon Donkey ...,0,0,3,Biden,10,0.0,0.0
5,Ofcourse But it will literally always be like...,1,0,0,Biden,20,0.583333,-0.1
6,Sure it could We don even know how much debt ...,1,0,0,Biden,36,0.62037,0.05555556
7,Texting is easy if you re not comfortable with...,0,0,0,Biden,11,0.816667,0.1166667
8,All m saying is not everyone can be in manage...,1,1,2,Biden,53,0.35,0.2
9,Lock up He a liar fraud criminal,0,0,0,Biden,7,0.55,-0.4


In [73]:
# Obtaining Polarity Analysis
def getPolarityAnalysis(score):
    if score < 0:
        return 'Negative'
    elif score == 0:
        return 'Neutral'
    else:
        return 'Positive'
    
second_debate['Sentiment']=second_debate['Polarity'].apply(getPolarityAnalysis)
second_debate.head(60)

Unnamed: 0,tweet,replies_count,retweets_count,likes_count,candidate,tweet_length,Subjectivity,Polarity,Sentiment
0,Hypothetical Question Let say Joe Biden wins t...,0,0,0,Biden,48,0.2875,0.215625,Positive
1,want the Healthcare fixed,0,0,0,Biden,4,0.2,0.1,Positive
2,Joe Biden and Kamala Harris will not let Donal...,0,1,1,Biden,25,0.383333,-0.2,Negative
3,Apparently thinks being compared to Mister Rog...,0,0,0,Biden,17,0.305,0.155,Positive
4,And Joe Biden doesn break the law mon Donkey ...,0,0,3,Biden,10,0.0,0.0,Neutral
5,Ofcourse But it will literally always be like...,1,0,0,Biden,20,0.583333,-0.1,Negative
6,Sure it could We don even know how much debt ...,1,0,0,Biden,36,0.62037,0.05555556,Positive
7,Texting is easy if you re not comfortable with...,0,0,0,Biden,11,0.816667,0.1166667,Positive
8,All m saying is not everyone can be in manage...,1,1,2,Biden,53,0.35,0.2,Positive
9,Lock up He a liar fraud criminal,0,0,0,Biden,7,0.55,-0.4,Negative


### Convert ML_Data to CSV

In [74]:
second_debate.to_csv(r'C:\Users\Greg\Documents\Analysis_Projects\US_Election_NLP\US_Election_NLP\Concat_ML_Data\second_debate.csv', index = False, header=True)