In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import re
import pandas as pd
import nltk
import nltk
nltk.download('wordnet')
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.tokenize.stanford import StanfordTokenizer 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
data = pd.read_csv('/content/drive/My Drive/Final_Practicum/sentiment140/Sentiment_Analysis_Dataset.csv', encoding="ISO-8859-1")
data.columns = ['label','id' ,' time', 'no_query', 'source','text']
data.head(2)


Unnamed: 0,label,id,time,no_query,source,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...


In [0]:
data.head(10)

Unnamed: 0,label,id,time,no_query,source,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
5,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
6,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
7,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
8,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?
9,0,1467812416,Mon Apr 06 22:20:16 PDT 2009,NO_QUERY,erinx3leannexo,spring break in plain city... it's snowing


In [0]:
tidy_tweets=[]
def remove_pattern(text, pattern_regex):
    r = re.findall(pattern_regex, text)
    for i in r:
        text = re.sub(i, '', text)
    
    return text
# We are keeping cleaned tweets in a new column called 'tidy_tweets'
data['tidy_tweets'] = np.vectorize(remove_pattern)(data['text'], "@[\w]* | *RT*")
print(data['tidy_tweets'].head(10))


0    is upset that he can't update his Facebook by ...
1    I dived many times for the ball. Managed to sa...
2      my whole body feels itchy and like its on fire 
3    no, it's not behaving at all. i'm mad. why am ...
4                                  not the whole crew 
5                                          Need a hug 
6    hey  long time no see! Yes..ains a bit ,only a...
7                            nope they didn't have it 
8                                      que me muera ? 
9          spring break in plain city... it's snowing 
Name: tidy_tweets, dtype: object


In [0]:
cleaned_tweets = []

for index, row in data.iterrows():
    # Here we are filtering out all the words that contains link
    words_without_links = [word for word in row.tidy_tweets.split() if 'http' not in word]
    cleaned_tweets.append(' '.join(words_without_links))

data['tidy_tweets'] = cleaned_tweets


In [0]:
data['absolute_tidy_tweets'] = data['tidy_tweets'].str.replace("[^a-zA-Z ]", "")
data.head(10)

Unnamed: 0,label,text,tidy_tweets,absolute_tidy_tweets
0,0,Started using skim milk in my iced coffee A...,Started using skim milk in my iced coffee And ...,Started using skim milk in my iced coffee And ...
1,1,yey just got in on brogÃ¥rn,yey just got in on brogÃ¥rn,yey just got in on brogrn
2,1,@AJSupreme pretty wack?...that shit was beyond...,pretty wack?...that shit was beyond wack lls,pretty wackthat shit was beyond wack lls
3,0,Thinking that if Liverpool waste Â£18million o...,Thinking that if Liverpool waste Â£18million o...,Thinking that if Liverpool waste million on Gl...
4,0,"@thejazzter Sorry about last night, love! Fel...","Sorry about last night, love! Fell asleep!",Sorry about last night love Fell asleep
5,1,"im up, &amp; showered! now i have an hour to d...","im up, &amp; showered! now i have an hour to d...",im up amp showered now i have an hour to do my...
6,1,@curiousmike Maybe until Wednesday?,Maybe until Wednesday?,Maybe until Wednesday
7,0,@Nirjhor i never used lindows,i never used lindows,i never used lindows
8,0,"goodbye to twitter mobile updates, dont need T...","goodbye to twitter mobile updates, dont need T...",goodbye to twitter mobile updates dont need TH...
9,0,@sagittarius88 ican't im not tired &amp; now u...,ican't im not tired &amp; now ur usin dat agai...,icant im not tired amp now ur usin dat against...


In [0]:
def in_dict(word):
    if wordnet.synsets(word):
        #if the word is in the dictionary, we'll return True
        return True

def replace_elongated_word(word):
    regex = r'(\w*)(\w+)\2(\w*)'
    repl = r'\1\2\3'    
    if in_dict(word):
        return word
    new_word = re.sub(regex, repl, word)
    if new_word != word:
        return replace_elongated_word(new_word)
    else:
        return new_word

def detect_elongated_words(row):
    regexrep = r'(\w*)(\w+)(\2)(\w*)'
    words = [''.join(i) for i in re.findall(regexrep, row)]
    for word in words:
        if not in_dict(word):
            row = re.sub(word, replace_elongated_word(word), row)
    return row

In [0]:

data['absolute_tidy_tweets'] = data['absolute_tidy_tweets'].apply(lambda x: detect_elongated_words(x))

In [0]:
lemmatizer = WordNetLemmatizer() 
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [0]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [0]:
cleaned_tweets=[]
for index,row in data.iterrows():
    stopWords = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(row["absolute_tidy_tweets"])
    rm_stopwords= [w for w in tokens if w not in stopWords]
    lemmatization=' '.join([lemmatizer.lemmatize(word , get_wordnet_pos(word)) for word in rm_stopwords if word])
    cleaned_tweets.append(lemmatization)
data['cleaned_tweets']=pd.Series(cleaned_tweets)
data.head(10)

Unnamed: 0,label,text,tidy_tweets,absolute_tidy_tweets,cleaned_tweets
0,0,Started using skim milk in my iced coffee A...,Started using skim milk in my iced coffee And ...,Started using skim milk in my iced coffee And ...,Started use skim milk iced coffee And yes real...
1,1,yey just got in on brogÃ¥rn,yey just got in on brogÃ¥rn,yey just got in on brogrn,yey get brogrn
2,1,@AJSupreme pretty wack?...that shit was beyond...,pretty wack?...that shit was beyond wack lls,pretty wackthat shit was beyond wack ls,pretty wackthat shit beyond wack l
3,0,Thinking that if Liverpool waste Â£18million o...,Thinking that if Liverpool waste Â£18million o...,Thinking that if Liverpool waste million on Gl...,Thinking Liverpool waste million Glen Johnson ...
4,0,"@thejazzter Sorry about last night, love! Fel...","Sorry about last night, love! Fell asleep!",Sorry about last night love Fell asleep,Sorry last night love Fell asleep
5,1,"im up, &amp; showered! now i have an hour to d...","im up, &amp; showered! now i have an hour to d...",im up amp showered now i have an hour to do my...,im amp shower hour makeup dry hair amp get dre...
6,1,@curiousmike Maybe until Wednesday?,Maybe until Wednesday?,Maybe until Wednesday,Maybe Wednesday
7,0,@Nirjhor i never used lindows,i never used lindows,i never used lindows,never use lindows
8,0,"goodbye to twitter mobile updates, dont need T...","goodbye to twitter mobile updates, dont need T...",goodbye to twitter mobile updates dont need TH...,goodbye twitter mobile update dont need THAT k...
9,0,@sagittarius88 ican't im not tired &amp; now u...,ican't im not tired &amp; now ur usin dat agai...,icant im not tired amp now ur usin dat against...,icant im tire amp ur usin dat fair


In [0]:
data["label"]= data["label"].replace(4, 1) 
data = data.sample(frac=1).reset_index(drop=True)
data.head(20)

Unnamed: 0,label,text,tidy_tweets,absolute_tidy_tweets,cleaned_tweets
0,1,oops...i meant to say...hope your left arm is ...,oops...i meant to say...hope your left arm is ...,opsi meant to sayhope your left arm is ok drak...,opsi meant sayhope left arm ok drakardnoir
1,0,I want ice cream.,I want ice cream.,I want ice cream,I want ice cream
2,1,I'm happy right now,I'm happy right now,Im happy right now,Im happy right
3,1,@newmoonmovie I contributed to probably 200 of...,I contributed to probably 200 of those. HAHA.,I contributed to probably of those HA,I contribute probably HA
4,1,Heading to practice..tee off 9:27!!! Have grea...,Heading to practice..tee off 9:27!!! Have grea...,Heading to practicete off Have great day ever...,Heading practicete Have great day everyone
5,0,it's saturday night and i have no plans this ...,it's saturday night and i have no plans this h...,its saturday night and i have no plans this ha...,saturday night plan hasnt happen really long time
6,1,@_beck_ Hi Beck! I'm back in Brissy. Weekend w...,"Hi Beck! I'm back in Brissy. Weekend was good,...",Hi Beck Im back in Brisy Weekend was good alth...,Hi Beck Im back Brisy Weekend good although wo...
7,0,My family says my music is too depressing for ...,My family says my music is too depressing for ...,My family says my music is too depressing for ...,My family say music depress beach Haha
8,1,pretty bored. Just thinkin about Haley William...,pretty bored. Just thinkin about Haley William...,pretty bored Just thinkin about Haley Williams...,pretty bore Just thinkin Haley Williams right ...
9,0,@mareyachristina oh well thats good! Mine was ...,oh well thats good! Mine was kinda cold i wish...,oh well thats good Mine was kinda cold i wish ...,oh well thats good Mine kinda cold wish still ...


In [0]:
data.to_csv('/content/drive/My Drive/Final_Practicum/sentiment140/Sentiment_cleaned.csv', header=True, index=False)