# PRE-PROCESSING

In [1]:
import pandas as pd
import json
import re
import string
import unicodedata
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [2]:
raw_tweets_text=[]

def get_data(url):
    with open(url) as fp:
        for line in fp:
                if(line!="\n"):
                    tweet_as_dict=json.loads(line)
                    raw_tweets_text.append(tweet_as_dict['text'])
    convert_to_csv()

def convert_to_csv():    
    df=pd.DataFrame(raw_tweets_text)
    df.to_csv('tweets.csv')

In [3]:
get_data('tweets_file.txt')

In [4]:
d = pd.read_csv("tweets.csv")
d = d.drop("Unnamed: 0",axis=1)
d = d.rename(columns={"0":"Text"})
d.drop_duplicates(inplace=True)
d = d.reset_index(drop=True)
d.to_csv('Tweets.csv')
d.head(10)

Unnamed: 0,Text
0,RT @sculpturejay: 🐆should we talk about i-land...
1,RT @abetokhi: Yes suicide is haram but making\...
2,"RT @geezybeatz145__: I am stressed, I am depre..."
3,RT @MillsReggie: always learn how to be strong...
4,@CATARllNA Should I add another one then? 🤔
5,RT @evalution_music: If you wanna see me at ba...
6,This is the underside of my world.Of course yo...
7,RT @empresslexiii: I dont care if you are work...
8,RT @Bryan62784488: Within the New York State P...
9,I'm literally tired


In [5]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6941 entries, 0 to 6940
Data columns (total 1 columns):
Text    6941 non-null object
dtypes: object(1)
memory usage: 54.4+ KB


In [6]:
def process_data(df):
    text_emoji = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE)
    text_links = re.compile('httpstco[a-zA-Z0-9]+')
    text_rts = re.compile('RT [A-Za-z0-9]+')
    text_schar = re.compile('[^A-Za-z0-9 ]+')
    text_nums = re.compile('[0-9]+')
    
    for i in range(len(df)):
        txt=df["Text"].iloc[i]
        txt=text_emoji.sub(r'',txt)
        temp=txt.translate(txt.maketrans('','',string.punctuation))
        temp=text_links.sub(r'',temp)
        temp=text_rts.sub(r'',temp)        
        temp=text_schar.sub(r'',temp)
        temp=text_nums.sub(r'',temp)
        temp = unicodedata.normalize('NFKD',temp).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        df["Text"].iloc[i] = temp.lower()
    return df   

In [8]:
df = process_data(d.copy())
df.head(10)

Unnamed: 0,Text
0,should we talk about ilandthere are surprisin...
1,yes suicide is haram but makingsomeones life ...
2,i am stressed i am depressed here we go again...
3,always learn how to be strong alone
4,catarllna should i add another one then
5,if you wanna see me at bass canyon or lost la...
6,this is the underside of my worldof course you...
7,i dont care if you are working i dont care if...
8,within the new york state park police alone ...
9,im literally tired


In [9]:
def further_ops(df):
    for i in range(len(df)):
        tweet = df["Text"].iloc[i]
        tweet = contractions.fix(tweet)
        token = word_tokenize(tweet)
        token_no_sw = [word for word in token if word not in stopwords.words()]
        lemmatizer = WordNetLemmatizer()
        lemm_token = [lemmatizer.lemmatize(word) for word in token_no_sw]
        df["Text"].iloc[i] = " ".join(lemm_token)
    return df    

In [10]:
dft = further_ops(df.copy())
dft.head(10)

Unnamed: 0,Text
0,talk ilandthere surprisingly lot people like i...
1,yes suicide haram makingsomeones life miserabl...
2,stress depress go need somebody talk
3,always learn strong alone
4,catarllna add another
5,wan see bass canyon lose land fill survey thin...
6,underside worldof course dont stupid bles iiiii
7,dont work dont mall brain fill poison edg
8,within new york state park police alone office...
9,literally tire


In [11]:
index = df[df["Text"]==""].index
df.drop(index,inplace=True)
df = df.reset_index(drop="True")

In [13]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6294 entries, 0 to 6293
Data columns (total 1 columns):
Text    6294 non-null object
dtypes: object(1)
memory usage: 49.3+ KB


In [14]:
d.to_csv('Processed_tweets.csv')