### 1. Importing the libraries.

In [1]:
import pandas as pd
import re #regular expression
from textblob import TextBlob
import string
import os
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Arpit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 2. Defining Emoticons and Emojis.
A tweet might contain some emoticons and emojis, therefore they have to be removed before tweets are processed.

In [2]:
#HappyEmoticons
emoticons_happy = set([
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3'
    ])

In [3]:
# Sad Emoticons
emoticons_sad = set([
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';('
    ])

In [4]:
#Emoji patterns
emoji_pattern = re.compile("["
         u"\U0001F600-\U0001F64F"  # emoticons
         u"\U0001F300-\U0001F5FF"  # symbols & pictographs
         u"\U0001F680-\U0001F6FF"  # transport & map symbols
         u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
         u"\U00002702-\U000027B0"
         u"\U000024C2-\U0001F251"
         "]+", flags=re.UNICODE)

In [5]:
#combine sad and happy emoticons
emoticons = emoticons_happy.union(emoticons_sad)

### 3. Define a function clean_tweets for cleaning the unnecessary words.

In [6]:
def clean_tweets(tweet):
 
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(tweet)
#after tweepy preprocessing the colon symbol left remain after      #removing mentions
    tweet = re.sub(r':', '', tweet)
    tweet = re.sub(r'‚Ä¶', '', tweet)
#replace consecutive non-ASCII characters with a space
    tweet = re.sub(r'[^\x00-\x7F]+',' ', tweet)
#remove emojis from tweet
    tweet = emoji_pattern.sub(r'', tweet)
#filter using NLTK library append it to a string
    filtered_tweet = [w for w in word_tokens if not w in stop_words]
    filtered_tweet = []
#looping through conditions
    for w in word_tokens:
#check tokens against stop words , emoticons and punctuations
        if w not in stop_words and w not in emoticons and w not in string.punctuation:
            filtered_tweet.append(w)
    return ' '.join(filtered_tweet)
    #print(word_tokens)
    #print(filtered_sentence)return tweet

In [7]:
COLS = ['polarity', 'subjectivity']

### 4. Define a function analysis of analysing tweets and finding the sentiment.

In [8]:
def analysis(inputfile, outputfile):
    
    indf = pd.read_csv(inputfile)
    if os.path.exists(outputfile):
        outdf = pd.read_csv(outputfile, header=0)
    else:
        outdf = pd.DataFrame(columns=COLS)
    
    for tweet in indf['original_text']:
        new_entry = []
        filtered_tweet=clean_tweets(tweet)
        blob = TextBlob(filtered_tweet)
        Sentiment = blob.sentiment     
        polarity = Sentiment.polarity
        subjectivity = Sentiment.subjectivity
        new_entry += [polarity,subjectivity]
        
        single_tweet_df = pd.DataFrame([new_entry], columns=COLS)
        outdf = outdf.append(single_tweet_df, ignore_index=True)
    
    outdf['created_at']=indf['created_at']
    outdf['location']=indf['place_coord_boundaries']
    
    csvFile = open(outputfile, 'a' ,encoding='utf-8')
    outdf.to_csv(csvFile, index=False)

### 5. Calling the function analysis.
This will give a csv file containing the Geo-location and Sentiment of tweets from that locatin.

In [9]:
analysis(inputfile = 'angreziMedium.csv', outputfile = 'angrezimediumanalyze.csv')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
