In [2]:
# Mount Google Drive at /content/drive so files stored on Drive are reachable
# from this Colab runtime through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import numpy as np
import random
import re
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import pandas as pd
import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [4]:
# Every dataset file lives in one Drive folder; build each path from that folder
# so the location only has to be changed in one place.
DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/twitter-datasets"

# Raw inputs.
labeled_document = f"{DATA_DIR}/train_labeled.txt"
pos_file_path = f"{DATA_DIR}/train_pos.txt"
neg_file_path = f"{DATA_DIR}/train_neg.txt"
test_file_path = f"{DATA_DIR}/test_data.txt"

# Preprocessed outputs written by the cells further down.
pos_file_path_prep = f"{DATA_DIR}/train_pos_prep.txt"
neg_file_path_prep = f"{DATA_DIR}/train_neg_prep.txt"
test_file_path_prep = f"{DATA_DIR}/test_data_prep.txt"

In [5]:
def rewrite_abbreviations(line):
    """Expand common English contractions and chat abbreviations.

    Two groups of rewrites are applied:

    * Apostrophe forms (``can't``, ``'ll``, ``it's`` ...) plus ``thanx`` and
      ``&`` are plain substring replacements applied in the listed order.
      Irregular negations (``can't``, ``won't``, ``shan't``, ``ain't``) must
      come before the generic ``n't`` rule, otherwise ``won't`` would turn
      into ``wo not``.
    * Apostrophe-less forms (``wont``, ``isnt`` ...) are only rewritten when
      they stand alone as a word, so words that merely contain them
      (``wontons``) are left untouched.

    Args:
        line: The text to rewrite. Matching is case-sensitive and only the
            lowercase spellings listed here are recognised.

    Returns:
        The text with the recognised abbreviations expanded.
    """
    substring_rules = (
        ("can't", 'cannot'),
        ("won't", 'will not'),
        ("shan't", 'shall not'),
        ("ain't", 'is not'),
        ("n't", ' not'),
        ("i'm", 'i am'),
        ("'re", ' are'),
        ("it's", 'it is'),
        ("that's", 'that is'),
        ("'ll", ' will'),
        ("'ve", ' have'),
        ("'d", ' would'),
        ("he's", 'he is'),
        ("she's", 'she is'),
        ("what's", 'what is'),
        ("who's", 'who is'),
        ('thanx', 'thanks'),
        ('&', 'and'),
    )
    for short_form, expansion in substring_rules:
        line = line.replace(short_form, expansion)

    whole_word_rules = {
        'wont': 'will not',
        'wasnt': 'was not',
        'isnt': 'is not',
        'werent': 'were not',
        'didnt': 'did not',
        'couldnt': 'could not',
        'shouldnt': 'should not',
        'wouldnt': 'would not',
        'havent': 'have not',
        'youre': 'you are',
        'youve': 'you have',
        'theyre': 'they are',
        'theyve': 'they have',
        'dunno': 'do not know',
        'youll': 'you will',
        'theyll': 'they will',
    }
    # \b on both sides restricts the match to complete words.
    whole_word_pattern = r'\b(?:' + '|'.join(whole_word_rules) + r')\b'
    return re.sub(whole_word_pattern, lambda m: whole_word_rules[m.group(0)], line)

def remove_hashtags(line):
    """Delete Twitter ``@mentions`` and ``#hashtags`` from ``line``.

    A mention/hashtag is the marker character followed by one or more ASCII
    letters, digits or underscores. Mentions are removed first, then hashtags.
    A marker that is not followed by such a character is kept.
    """
    for marker in ("@", "#"):
        line = re.sub(marker + "[A-Za-z0-9_]+", '', line)
    return line

def replace_numbers(line):
    """Replace every run of two or more ASCII digits with a size placeholder.

    Run length -> placeholder: 5 or more -> ``numhuge``, 4 -> ``numlarge``,
    3 -> ``nummedium``, 2 -> ``numsmall``. Single digits are left as they are.
    """
    def size_word(match):
        # Bucket the matched digit run by its length.
        digit_count = len(match.group(0))
        if digit_count >= 5:
            return 'numhuge'
        if digit_count == 4:
            return 'numlarge'
        if digit_count == 3:
            return 'nummedium'
        return 'numsmall'

    return re.sub('[0-9]{2,}', size_word, line)

def remove_stop_word(line):
    """Remove the literal ``<user>`` and ``<url>`` placeholder tokens from ``line``."""
    return line.replace('<user>', '').replace('<url>', '')
    
def remove_punctuations(line):
    """Delete a fixed set of ASCII punctuation characters from ``line``.

    Characters outside the set below (for example ``!`` and ``?``) are kept.
    """
    unwanted = (
        '"$%&\\\'()*+'
        ',-./^_`{|:'
        ';<=>@[]}~#'
    )
    # A deletion-only translation table drops every listed character in one pass.
    return line.translate(str.maketrans('', '', unwanted))

# Positive tokens that are replaced wherever they occur in the text.
_EMOTICONS_GOOD = (
    ':-)', ':)', ';)', ':o)', ':]', ':3', '<3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', 'x-D', 'X-D', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    'hahaha', 'haha')

# Short alphanumeric positive tokens. They also occur inside ordinary words
# ("expect" contains "xp"), so they are only replaced as stand-alone words.
_EMOTICON_WORDS_GOOD = ('8D', 'xD', 'XD', 'xp', 'XP', 'xo', 'lol')

# Negative tokens that are replaced wherever they occur in the text.
_EMOTICONS_BAD = (
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', ':/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';(')


def _emoticon_regex(symbols, words=()):
    """Build one regex matching any token in ``symbols`` or, as whole words, in ``words``."""
    def longest_first(tokens):
        # Longer tokens must be tried before their own prefixes/substrings
        # (':-))' before ':-)'); the secondary key keeps the order stable.
        return sorted(set(tokens), key=lambda token: (-len(token), token))

    branches = [re.escape(token) for token in longest_first(symbols)]
    if words:
        word_group = '|'.join(re.escape(token) for token in longest_first(words))
        # Repetition lets "xoxo" collapse into a single replacement.
        branches.append(r'(?<!\w)(?:' + word_group + r')+(?!\w)')
    return re.compile('|'.join(branches))


_GOOD_EMOTICON_RE = _emoticon_regex(_EMOTICONS_GOOD, _EMOTICON_WORDS_GOOD)
_BAD_EMOTICON_RE = _emoticon_regex(_EMOTICONS_BAD)


def replace_emoticons(line):
    """Replace positive emoticons with ``good`` and negative ones with ``bad``.

    Positive tokens are handled first, then negative ones. Within each group
    the longest token wins at any position, so the result does not depend on
    set iteration order. Short alphanumeric tokens such as ``xp``, ``xo`` or
    ``lol`` are replaced only when they form a whole word; a run of them
    (``xoxo``) becomes a single ``good``.

    Args:
        line: The text to rewrite.

    Returns:
        The text with recognised emoticons replaced by ``good`` / ``bad``.
    """
    line = _GOOD_EMOTICON_RE.sub('good', line)
    line = _BAD_EMOTICON_RE.sub('bad', line)
    return line

def remove_spaces(line):
    """Squeeze runs of spaces into one space and drop leading whitespace.

    Only the space character is squeezed; trailing whitespace is kept.
    """
    squeezed = re.sub(' {2,}', ' ', line)
    return squeezed.lstrip()

def stemming_words(line):
    """Stem each whitespace-separated word of ``line`` with NLTK's PorterStemmer.

    The stemmed words are joined back with single spaces, so any leading,
    trailing or repeated whitespace (including a trailing newline) disappears.
    """
    stem = PorterStemmer().stem
    return " ".join(stem(token) for token in line.split())

In [6]:
def preprocessing(line):
    """Run one tweet through the full cleaning pipeline and return the result.

    The steps run in the order listed below. Emoticons are handled first,
    before the punctuation they are made of is stripped.
    """
    steps = (
        replace_emoticons,
        rewrite_abbreviations,
        remove_hashtags,
        replace_numbers,
        remove_stop_word,
        remove_punctuations,
        remove_spaces,
        stemming_words,
    )
    for step in steps:
        line = step(line)
    return line

In [7]:
# Preprocess the positive training tweets: one cleaned tweet per output line.
with open(pos_file_path, encoding='utf8') as src:
    with open(pos_file_path_prep, "w+", encoding='utf-8') as dst:
        dst.writelines(preprocessing(tweet) + '\n' for tweet in src)

In [8]:
# Preprocess the negative training tweets: one cleaned tweet per output line.
with open(neg_file_path, encoding='utf8') as src:
    with open(neg_file_path_prep, "w+", encoding='utf-8') as dst:
        dst.writelines(preprocessing(tweet) + '\n' for tweet in src)
      

In [9]:
# Preprocess the test tweets. Everything up to and including the first comma
# of each input line is discarded before cleaning.
with open(test_file_path, encoding='utf8') as src:
    with open(test_file_path_prep, "w+", encoding='utf-8') as dst:
        for raw in src:
            _, _, tweet = raw.partition(',')
            cleaned = preprocessing(tweet)
            # A tweet that is empty after cleaning is written as a placeholder
            # word instead of a blank line.
            dst.write("nothing \n" if cleaned == "" else cleaned + '\n')

In [None]:
# Load the preprocessed positive tweets into a one-column DataFrame for a
# quick visual check of the pipeline output.
with open(pos_file_path_prep, encoding='utf8') as f:
    content = f.readlines()
data = pd.DataFrame([row.strip() for row in content])
data