In [114]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import re

In [115]:
def clean_words(content,doc_bool):
    '''
    Normalise a tokenised tweet and join it back into one string.

    Every kept word is case-folded. Purely alphabetic words of length <= 2
    are always removed. Stop words are removed as well when the model is not
    doc2vec (i.e. when ``doc_bool`` is false).

    :param content: iterable of word strings, e.g. the list from ``cut_words``
    :param doc_bool: truthy for doc2vec (keep stop words), falsy to drop them
    :return: the remaining lower-cased words joined by single spaces
    '''
    # The length / isalpha test is applied to the word as given; only the
    # surviving words are case-folded.
    lowered = [word.lower() for word in content
               if not (word.isalpha() and len(word) <= 2)]
    if not doc_bool:
        stops = set(stopwords.words("english"))
        # Compare the case-folded word: the NLTK English list is lower-case,
        # so testing the raw word would let capitalised stop words such as
        # "The" slip through.
        lowered = [word for word in lowered if word not in stops]
    return ' '.join(lowered)


def cut_words(content):
    '''
    Tokenize the tweet to words

    :param content: raw tweet text; may be ``''`` or ``None``
    :return: a cut word list (empty when there is nothing to tokenize)
    '''
    # Nothing to tokenize: return an empty list instead of falling through to
    # a result variable that was never assigned (UnboundLocalError).
    if content is None or content == '':
        return []
    seg_list = word_tokenize(content)
    # Join and re-split on single spaces so that no returned element can
    # itself contain a space.
    return ' '.join(seg_list).split(' ')

def get_data_df(file_list):
    '''
    Saving all data into one dataframe

    :param file_list: sequence of paths; item 0 is read with label 1 and
        item 1 with label 0 (any further items are ignored)
    :return: DataFrame with columns ``tweet`` and ``label``; the rows of the
        first file come first, followed by the rows of the second file
    '''
    # Substrings stripped from every line, applied in this order.
    noise = ("\t", "<user>", "\n", "<url>")
    data = []
    for path, label in ((file_list[0], 1), (file_list[1], 0)):
        # NOTE(review): the file is opened with the platform default text
        # encoding - confirm that matches how the dataset is encoded.
        with open(path) as f:
            for line in f:
                doc = line
                for marker in noise:
                    doc = doc.replace(marker, "")
                data.append([doc, label])

    return pd.DataFrame(data, columns=['tweet', 'label'])

In [116]:
# Read both training files into one labelled frame (first path -> label 1,
# second path -> label 0) and display it.
df = get_data_df(
    [
        "twitter-datasets/train_pos.txt",
        "twitter-datasets/train_neg.txt",
    ]
)
df

Unnamed: 0,tweet,label
0,i dunno justin read my mention or not . only ...,1
1,"because your logic is so dumb , i won't even c...",1
2,""" just put casper in a box ! "" looved the bat...",1
3,thanks sir > > don't trip lil mama ... just ...,1
4,visiting my brother tmr is the bestest birthda...,1
...,...,...
199995,can't wait to fake tan tonight ! hate being pale,0
199996,darling i lost my internet connection .. and ...,0
199997,kanguru defender basic 4 gb usb 2.0 flash driv...,0
199998,rizan is sad now,0


In [117]:
# Tokenise every training tweet, rebuild each token list into one cleaned
# string (doc_bool=True, so stop words are kept) and save the frame.
temp = df['tweet'].apply(cut_words)
df['tweet'] = temp.apply(clean_words, args=(True,))
df.to_csv("part.csv", index=False)

In [118]:
# Split the training frame by label and write each half to its own CSV.
df_pos = df.loc[df['label'] == 1].copy()
df_pos.to_csv("part_pos.csv", index=False)

df_neg = df.loc[df['label'] == 0].copy()
df_neg.to_csv("part_neg.csv", index=False)

In [119]:
# Read the test file: drop the leading field of every line (everything up to
# and including the first comma), strip the same markers as for the training
# data, and collect the remaining text in a one-column frame.
data = []
with open("twitter-datasets/test_data.txt") as f:
    for line in f:
        # Text after the first comma; empty when the line has no comma.
        doc = line.partition(',')[2]
        for marker in ("\t", "<user>", "\n", "<url>"):
            doc = doc.replace(marker, "")
        data.append([doc])
df_test = pd.DataFrame(data, columns=['tweet'])
df_test

Unnamed: 0,tweet
0,sea doo pro sea scooter ( sports with the port...
1,shucks well i work all week so now i can't co...
2,i cant stay away from bug thats my baby
3,no ma'am ! ! ! lol im perfectly fine and not ...
4,"whenever i fall asleep watching the tv , i alw..."
...,...
9995,had a nice time w / my friend lastnite
9996,no it's not ! please stop !
9997,not without my daughter ( dvd two-time oscar (...
9998,have fun in class sweetcheeks


In [120]:
# Apply the same tokenise-then-clean steps to the test tweets (doc_bool=True,
# so stop words are kept) and save the frame.
temp = df_test['tweet'].apply(cut_words)
df_test['tweet'] = temp.apply(clean_words, args=(True,))
df_test.to_csv("test.csv", index=False)