In [18]:
import re
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from textblob import Word, TextBlob
from nltk.stem.porter import PorterStemmer

In [2]:
# Load the tweet dataset. The path is relative to the notebook's working
# directory; the preview that follows shows `id`, `label` and `tweet` columns.
df = pd.read_csv('train_dataset.csv')

In [3]:
# Preview the first rows to confirm the file loaded with the expected columns.
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [4]:
# Drop the `id` column; no later cell shown in this notebook reads it.
# Rebinding `df` (instead of `inplace=True`) together with `errors='ignore'`
# keeps this cell idempotent: re-running it after `id` is already gone is a
# no-op rather than a KeyError.
columns = ['id']
df = df.drop(columns=columns, errors='ignore')

In [5]:
# Confirm that the `id` column is no longer present.
df.head()

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


In [6]:
# Download the NLTK corpora used below. `stopwords` backs the `stop_words`
# list; `wordnet` is presumably required by `Word(...).lemmatize()` in
# `removingstopwords` -- TODO confirm against the TextBlob documentation.
nltk.download('stopwords')
nltk.download('wordnet')
# English stop-word list consulted by `removingstopwords`.
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [7]:
def removingstopwords(tweet):
  """Drop English stop words from a tweet and lemmatize the remaining tokens.

  The tweet is split on whitespace, every token found in the module-level
  ``stop_words`` list is discarded, and each surviving token is passed through
  TextBlob's ``Word(...).lemmatize()`` with its default arguments.

  Punctuation is intentionally left untouched. The previous
  ``tweet.replace('[^\\w\\s]', '')`` statement never had any effect (its
  result was discarded, and ``str.replace`` matches literally rather than as a
  regex), and the later ``cleaning_tweet`` stage depends on ``@``, ``#``, URLs
  and emoticon characters still being present.

  Args:
    tweet: Raw tweet text as a ``str``.

  Returns:
    The kept, lemmatized tokens joined by single spaces. An empty or
    whitespace-only input yields ``""``.
  """
  # Single pass: tokens produced by split() contain no whitespace, so
  # filtering and lemmatizing together is equivalent to joining and
  # re-splitting between the two steps.
  return " ".join(
      Word(token).lemmatize()
      for token in tweet.split()
      if token not in stop_words
  )


In [8]:
# Remove stop words and lemmatize every tweet (values are coerced to str first).
# NOTE: this overwrites the `tweet` column, so re-running the cell reprocesses
# text that has already been processed.
df['tweet'] = df['tweet'].apply(lambda raw_text: removingstopwords(str(raw_text)))

In [9]:
# Inspect the tweets after stop-word removal and lemmatization.
df.head(20)

Unnamed: 0,label,tweet
0,0,@user father dysfunctional selfish drag kid dy...
1,0,@user @user thanks #lyft credit can't use caus...
2,0,bihday majesty
3,0,#model love u take u time urð±!!! ððð...
4,0,factsguide: society #motivation
5,0,[2/2] huge fan fare big talking leave. chaos p...
6,0,@user camping tomorrow @user @user @user @user...
7,0,next school year year exams.ð¯ can't think ð...
8,0,won!!! love land!!! #allin #cavs #champions #c...
9,0,@user @user welcome ! i'm #gr8 !


In [10]:
def preprocess_word(word):
    """Normalize a single token.

    Steps, in order:
      1. Trim leading/trailing quote and punctuation characters
         (``'``, ``"``, ``?``, ``!``, ``,``, ``.``, ``(``, ``)``, ``:``, ``;``).
      2. Collapse any run of the same character down to exactly two
         (e.g. ``funnnnny`` -> ``funny``).
      3. Delete every remaining hyphen and apostrophe.

    Args:
        word: The token to normalize.

    Returns:
        The normalized token (possibly an empty string).
    """
    trimmed = word.strip('\'"?!,.():;')
    squeezed = re.sub(r'(.)\1+', r'\1\1', trimmed)
    return re.sub(r'(-|\')', '', squeezed)

In [11]:
def preprocess_tweet(tweet):
  """Normalize one piece of tweet text (``cleaning_tweet`` calls this per token).

  Steps, in order:
    1. Trim leading/trailing quote, punctuation and ``/`` characters.
    2. Collapse any run of the same character down to exactly two.
    3. Delete every remaining hyphen and apostrophe.

  Args:
    tweet: The text to normalize.

  Returns:
    The normalized text (possibly an empty string).
  """
  trimmed = tweet.strip('\'"?!,.():;/')
  squeezed = re.sub(r'(.)\1+', r'\1\1', trimmed)
  return re.sub(r'(-|\')', '', squeezed)

In [12]:
def is_valid_tweet(tweet):
    """Tell whether a token is an acceptable word.

    A token is accepted when it starts with an ASCII letter and every
    following character is an ASCII letter, a digit, ``.`` or ``_``.

    Args:
        tweet: The token to test.

    Returns:
        ``True`` if the token matches the pattern, ``False`` otherwise
        (including for the empty string).
    """
    match = re.search(r'^[a-zA-Z][a-z0-9A-Z\._]*$', tweet)
    return match is not None

In [13]:
def handle_emojis(tweet):
    """Replace text emoticons with ``' EMO_POS '`` or ``' EMO_NEG '`` placeholders.

    The substitutions run one after another in the order listed below, each
    on the result of the previous one. Matching is case-sensitive.

    Args:
        tweet: The text to scan.

    Returns:
        The text with every recognised emoticon replaced by its
        space-padded placeholder.
    """
    emoticon_rules = (
        # Smile -- :), : ), :-), (:, ( :, (-:, :')
        (r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' EMO_POS '),
        # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
        (r'(:\s?D|:-D|x-?D|X-?D)', ' EMO_POS '),
        # Love -- <3, :*
        (r'(<3|:\*)', ' EMO_POS '),
        # Wink -- ;-), ;), ;-D, ;D, (;,  (-;
        (r'(;-?\)|;-?D|\(-?;)', ' EMO_POS '),
        # Sad -- :-(, : (, :(, ):, )-:
        (r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' EMO_NEG '),
        # Cry -- :,(, :'(, :"(
        (r'(:,\(|:\'\(|:"\()', ' EMO_NEG '),
    )
    for pattern, placeholder in emoticon_rules:
        tweet = re.sub(pattern, placeholder, tweet)
    return tweet

In [21]:
def cleaning_tweet(tweet):
    """Normalize a tweet into a space-separated string of clean tokens.

    Pipeline, in order:
      1. Lower-case the text.
      2. Replace URLs with `` URL ``, ``@handle`` with ``USER_MENTION``,
         ``#tag`` with `` tag ``, drop standalone ``rt``, and turn runs of two
         or more dots into a space.
      3. Strip spaces and quote characters from both ends.
      4. Replace emoticons via ``handle_emojis``.
      5. Collapse whitespace, split into tokens, normalize each token with
         ``preprocess_tweet`` and keep only those ``is_valid_tweet`` accepts.

    NOTE: because the text is lower-cased before ``handle_emojis`` runs, the
    emoticon patterns in that helper that require an upper-case ``D``
    (``:D``, ``xD``, ``;D`` ...) cannot match here.

    Args:
        tweet: Raw tweet text as a ``str``.

    Returns:
        The accepted tokens joined by single spaces (``""`` if none survive).
    """
    # Ordered (pattern, replacement) pairs; each runs on the previous result.
    substitutions = (
        # Replaces URLs with the word URL
        (r'((www\.[\S]+)|(https?://[\S]+))', ' URL '),
        # Replace @handle with the word USER_MENTION
        (r'@[\S]+', 'USER_MENTION'),
        # Replaces #hashtag with hashtag
        (r'#(\S+)', r' \1 '),
        # Remove RT (retweet)
        (r'\brt\b', ''),
        # Replace 2+ dots with space
        (r'\.{2,}', ' '),
    )

    text = tweet.lower()
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Strip space, " and ' from both ends, then map emoticons to placeholders.
    text = handle_emojis(text.strip(' "\''))
    # Collapse every whitespace run to one space before tokenizing.
    text = re.sub(r'\s+', ' ', text)

    normalized_tokens = (preprocess_tweet(token) for token in text.split())
    return ' '.join(token for token in normalized_tokens if is_valid_tweet(token))

In [22]:
# Run the full cleaning pipeline on every tweet (values are coerced to str first).
# NOTE: this overwrites the `tweet` column, so re-running the cell cleans
# text that has already been cleaned.
df['tweet'] = df['tweet'].apply(lambda raw_text: cleaning_tweet(str(raw_text)))

In [23]:
# Inspect the tweets after the full cleaning pipeline.
df.head(20)

Unnamed: 0,label,tweet
0,0,USER_MENTION father dysfunctional selfish drag...
1,0,USER_MENTION USER_MENTION thanks lyft credit c...
2,0,bihday majesty
3,0,model love u take u time
4,0,factsguide society motivation
5,0,huge fan fare big talking leave chaos pay disp...
6,0,USER_MENTION camping tomorrow USER_MENTION USE...
7,0,next school year year cant think school exams ...
8,0,won love land allin cavs champions cleveland c...
9,0,USER_MENTION USER_MENTION welcome im gr8
