In [1]:
import os
import re
from string import punctuation                   # to extract the puntuation symbols

import numpy as np
import pandas as pd

from nltk.tokenize import word_tokenize          # to divide strings into tokens
from nltk.stem import WordNetLemmatizer          # to lemmatize the tokens
from nltk.corpus import stopwords                # to remove the stopwords 
#import pos tagger
from nltk import pos_tag

In [2]:
# Module-level accumulators filled by load_tweets: tweet texts and their labels,
# kept in the same order so that index i in one matches index i in the other.
tweets, labels = [], []


def load_tweets(filename, label):
    """Read `filename` line by line into the global `tweets` / `labels` lists.

    Each line has its trailing whitespace (including the newline) stripped and
    is appended to `tweets`; `label` is appended to `labels` once per line.
    The file is opened as UTF-8 text. Nothing is returned.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        for stripped_line in map(str.rstrip, handle):
            tweets.append(stripped_line)
            labels.append(label)
# Negative tweets are loaded first with label 0, positive tweets second with label 1.
load_tweets(filename='twitter-datasets/train_neg_full.txt', label=0)
load_tweets(filename='twitter-datasets/train_pos_full.txt', label=1)

# Convert to NumPy arrays to facilitate indexing
tweets, labels = np.array(tweets), np.array(labels)

print(f'{len(tweets)} tweets loaded')

2500000 tweets loaded


In [3]:
# Pair every tweet with its label in a single two-column frame.
tweets_df = pd.DataFrame(
    data={'tweet': tweets, 'label': labels},
)

In [4]:
# Preview the first five rows of the raw frame.
tweets_df.head(n=5)

Unnamed: 0,tweet,label
0,vinco tresorpack 6 ( difficulty 10 of 10 objec...,0
1,glad i dot have taks tomorrow ! ! #thankful #s...,0
2,1-3 vs celtics in the regular season = were fu...,0
3,<user> i could actually kill that girl i'm so ...,0
4,<user> <user> <user> i find that very hard to ...,0


In [5]:
# Cache for the English stopword set; filled on the first remove_stopwords call
# so the corpus lookup happens once instead of once per token.
_ENGLISH_STOPWORDS = None


def remove_stopwords(text):
    """Return the tokens of `text` that are not English stopwords.

    Parameters
    ----------
    text : iterable of str
        Tokens of one tweet (despite the name, this is a token sequence, not a
        raw string: iterating a plain string would filter single characters).

    Returns
    -------
    list of str
        The tokens that are not in NLTK's English stopword list, in their
        original order.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Previously `stopwords.words('english')` was evaluated inside the
        # comprehension condition, i.e. once per token, and membership was
        # tested against a list. A frozenset built once gives the same answers
        # for string tokens with constant-time lookups.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return [w for w in text if w not in _ENGLISH_STOPWORDS]

def clean_text(text):
    '''Lowercase a tweet and strip URL placeholders, newlines and digit-bearing words.

    Steps, applied in order:
      1. lowercase the text;
      2. delete every ``<url>`` placeholder;
      3. delete newline characters;
      4. delete every run of word characters that contains at least one digit.

    Punctuation and ``<user>`` placeholders are left untouched here. Deleted
    spans are replaced by the empty string, so the whitespace around them is
    not collapsed.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The partially cleaned tweet.
    '''
    text = text.lower()
    text = re.sub(r'<url>', '', text)
    text = re.sub(r'\n', '', text)
    # Raw string literal: in a normal literal '\w' and '\d' are invalid escape
    # sequences, which Python reports with a warning at compile time.
    text = re.sub(r'\w*\d\w*', '', text)
    return text

def remove_punctuation(text):
    '''Return `text` with every character of `string.punctuation` deleted.

    Characters are removed outright (not replaced by spaces); everything else,
    including whitespace, is kept as is.
    '''
    punctuation_class = '[%s]' % re.escape(punctuation)
    return re.compile(punctuation_class).sub('', text)

# First cleaning pass: clean_text applied to every raw tweet.
tweets_df['partial_clean_tweet'] = tweets_df['tweet'].apply(clean_text)


In [6]:
# Second cleaning pass: strip punctuation from the partially cleaned tweets.
tweets_df['clean_tweet'] = tweets_df['partial_clean_tweet'].apply(remove_punctuation)

In [7]:
# Split each cleaned tweet into tokens, then filter the stopwords out of the token lists.
tweets_df['tokenized_tweet'] = tweets_df['clean_tweet'].apply(word_tokenize)
tweets_df['tokenized_tweet_no_stopwords'] = tweets_df['tokenized_tweet'].apply(remove_stopwords)

In [8]:
#lemmatize with pos tag
def lemmatize_with_pos_tag(tokenized_text):
    """Lemmatize tokens, using each token's part-of-speech tag where it maps to WordNet.

    The tokens are tagged with `pos_tag`; the first letter of each tag selects
    the `pos` argument passed to `WordNetLemmatizer.lemmatize`:

    * ``J`` (or ``A``) -> ``'a'`` (adjective)
    * ``N`` -> ``'n'`` (noun)
    * ``V`` -> ``'v'`` (verb)

    Any other tag falls back to `lemmatize(token)` with the lemmatizer's
    default part of speech.

    Parameters
    ----------
    tokenized_text : list of str
        Tokens of one tweet.

    Returns
    -------
    list of str
        One lemma per input token, in the same order.
    """
    # The previous check lower-cased the tag's first letter and looked for it in
    # ['a', 'n', 'v']. Penn Treebank adjective tags (JJ, JJR, JJS) start with
    # 'J', so the adjective branch could never fire; 'J' is mapped explicitly.
    # 'A' is kept so tags that do start with 'A' behave exactly as before.
    wordnet_pos_by_initial = {'A': 'a', 'J': 'a', 'N': 'n', 'V': 'v'}
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = []
    for token, tag in pos_tag(tokenized_text):
        wordnet_pos = wordnet_pos_by_initial.get(tag[:1].upper())
        if wordnet_pos is None:
            lemmatized_tokens.append(lemmatizer.lemmatize(token))
        else:
            lemmatized_tokens.append(lemmatizer.lemmatize(token, pos=wordnet_pos))
    return lemmatized_tokens
# Lemmatize the stopword-free token lists.
tweets_df['clean_tweet_tokenized_lemmatized'] = tweets_df['tokenized_tweet_no_stopwords'].apply(lemmatize_with_pos_tag)

In [9]:
# Intermediate columns that could be dropped; the drop itself is currently disabled.
drop_columns = [
    'tweet',
    'clean_tweet',
    'tokenized_tweet',
]
#tweets_df = tweets_df.drop(drop_columns, axis=1)

In [10]:
# Convert the lists of tokens back into space-separated strings.
def _join_tokens(tokens):
    """Join `tokens` with single spaces; return values that are already strings unchanged."""
    # The str guard makes this cell safe to re-run: calling ' '.join on an
    # already-joined string would insert a space between every character.
    return tokens if isinstance(tokens, str) else ' '.join(tokens)


for _token_column in (
    'clean_tweet_tokenized_lemmatized',
    'tokenized_tweet',
    'tokenized_tweet_no_stopwords',
):
    tweets_df[_token_column] = tweets_df[_token_column].apply(_join_tokens)


In [11]:
# Preview the first five rows with all derived columns.
tweets_df.head(n=5)

Unnamed: 0,tweet,label,partial_clean_tweet,clean_tweet,tokenized_tweet,tokenized_tweet_no_stopwords,clean_tweet_tokenized_lemmatized
0,vinco tresorpack 6 ( difficulty 10 of 10 objec...,0,vinco tresorpack ( difficulty of object : d...,vinco tresorpack difficulty of object dis...,vinco tresorpack difficulty of object disassem...,vinco tresorpack difficulty object disassemble...,vinco tresorpack difficulty object disassemble...
1,glad i dot have taks tomorrow ! ! #thankful #s...,0,glad i dot have taks tomorrow ! ! #thankful #s...,glad i dot have taks tomorrow thankful startho,glad i dot have taks tomorrow thankful startho,glad dot taks tomorrow thankful startho,glad dot taks tomorrow thankful startho
2,1-3 vs celtics in the regular season = were fu...,0,- vs celtics in the regular season = were fuck...,vs celtics in the regular season were fucked...,vs celtics in the regular season were fucked i...,vs celtics regular season fucked play playoffs,v celtic regular season fuck play playoff
3,<user> i could actually kill that girl i'm so ...,0,<user> i could actually kill that girl i'm so ...,user i could actually kill that girl im so sor...,user i could actually kill that girl im so sorry,user could actually kill girl im sorry,user could actually kill girl im sorry
4,<user> <user> <user> i find that very hard to ...,0,<user> <user> <user> i find that very hard to ...,user user user i find that very hard to believ...,user user user i find that very hard to believ...,user user user find hard believe im afraid,user user user find hard believe im afraid


In [12]:
# Rename clean_tweet_tokenized_lemmatized to text.
# rename() returns a new frame instead of mutating in place, and it ignores a
# missing source column, so re-running this cell is a no-op rather than the
# KeyError the previous copy-then-drop(inplace=True) sequence raised.
tweets_df = tweets_df.rename(columns={'clean_tweet_tokenized_lemmatized': 'text'})

In [13]:
# Split the frame by label: rows labelled 1 are the positive tweets, rows labelled 0 the negative ones.
positive_tweets = tweets_df.loc[tweets_df['label'].eq(1)]
negative_tweets = tweets_df.loc[tweets_df['label'].eq(0)]

In [14]:
# Save the data.
# Create the output directory first: to_csv does not create missing parent
# directories, and failing here would waste the whole preprocessing run.
os.makedirs('preprocessed', exist_ok=True)
positive_tweets.to_csv('preprocessed/train_pos_full.csv', index=False)
negative_tweets.to_csv('preprocessed/train_neg_full.csv', index=False)
tweets_df.to_csv('preprocessed/train_full.csv', index=False)


In [15]:
# Check for NaN.
# info() gives the column overview, but its output for this frame lists only
# dtypes (no non-null counts), so missing values are counted explicitly too.
tweets_df.info()
tweets_df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500000 entries, 0 to 2499999
Data columns (total 7 columns):
 #   Column                        Dtype 
---  ------                        ----- 
 0   tweet                         object
 1   label                         int64 
 2   partial_clean_tweet           object
 3   clean_tweet                   object
 4   tokenized_tweet               object
 5   tokenized_tweet_no_stopwords  object
 6   text                          object
dtypes: int64(1), object(6)
memory usage: 133.5+ MB
