In [1]:
import numpy as np
import pandas as pd
import re
from string import punctuation                   # to extract the puntuation symbols

from nltk.tokenize import word_tokenize          # to divide strings into tokens
from nltk.stem import WordNetLemmatizer          # to lemmatize the tokens
from nltk.corpus import stopwords                # to remove the stopwords 
#import pos tagger
from nltk import pos_tag

In [2]:
tweets = []
def load_tweets(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            tweets.append(line.rstrip())
load_tweets('twitter-datasets/test_data.txt')

# Convert to NumPy array to facilitate indexing
tweets = np.array(tweets)

print(f'{len(tweets)} tweets loaded')

10000 tweets loaded


In [3]:
tweets_df = pd.DataFrame({'tweet': tweets})

In [4]:
tweets_df.head()

Unnamed: 0,tweet
0,"1,sea doo pro sea scooter ( sports with the po..."
1,"2,<user> shucks well i work all week so now i ..."
2,"3,i cant stay away from bug thats my baby"
3,"4,<user> no ma'am ! ! ! lol im perfectly fine ..."
4,"5,whenever i fall asleep watching the tv , i a..."


In [5]:
def remove_stopwords(text):
    words = [w for w in text if w not in stopwords.words('english')]
    return words 

def clean_text(text):
    '''Make text lowercase, remove links,remove punctuation
    and remove words containing numbers.'''
    text = text.lower()
    text = re.sub(r'<url>', '', text)
    #text = re.sub(r'<user>', '', text)
    text = re.sub('[%s]' % re.escape(punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

tweets_df['clean_tweet'] = tweets_df['tweet'].apply(lambda x: clean_text(x))

In [6]:
tweets_df['clean_tweet'] = tweets_df['clean_tweet'].apply(lambda x: word_tokenize(x))
tweets_df['tokenized_tweet'] = tweets_df['clean_tweet'].apply(lambda x: remove_stopwords(x))

In [7]:
#lemmatize with pos tag
def lemmatize_with_pos_tag(tokenized_text):
    lemmatizer = WordNetLemmatizer()
    pos_tagged_text = pos_tag(tokenized_text)
    lemmatized_tokens = [lemmatizer.lemmatize(token, pos=tag[0].lower()) if tag[0].lower() in ['a','n','v'] else lemmatizer.lemmatize(token) for token, tag in pos_tagged_text]
    return lemmatized_tokens
tweets_df['clean_tweet_tokenized_lemmatized'] = tweets_df['tokenized_tweet'].apply(lambda x: lemmatize_with_pos_tag(x))

In [8]:
drop_columns = ['tweet', 'clean_tweet', 'tokenized_tweet']
#tweets_df = tweets_df.drop(drop_columns, axis=1)

In [9]:
#convert the list of tokens into a string
tweets_df['clean_tweet_tokenized_lemmatized'] = tweets_df['clean_tweet_tokenized_lemmatized'].apply(lambda x: ' '.join(x))

In [10]:
tweets_df.head()

Unnamed: 0,tweet,clean_tweet,tokenized_tweet,clean_tweet_tokenized_lemmatized
0,"1,sea doo pro sea scooter ( sports with the po...","[doo, pro, sea, scooter, sports, with, the, po...","[doo, pro, sea, scooter, sports, portable, sea...",doo pro sea scooter sport portable seadoo seas...
1,"2,<user> shucks well i work all week so now i ...","[shucks, well, i, work, all, week, so, now, i,...","[shucks, well, work, week, cant, come, cheer, ...",shuck well work week cant come cheer oh put ba...
2,"3,i cant stay away from bug thats my baby","[cant, stay, away, from, bug, thats, my, baby]","[cant, stay, away, bug, thats, baby]",cant stay away bug thats baby
3,"4,<user> no ma'am ! ! ! lol im perfectly fine ...","[no, maam, lol, im, perfectly, fine, and, not,...","[maam, lol, im, perfectly, fine, contagious, a...",maam lol im perfectly fine contagious anymore ...
4,"5,whenever i fall asleep watching the tv , i a...","[i, fall, asleep, watching, the, tv, i, always...","[fall, asleep, watching, tv, always, wake, hea...",fall asleep watch tv always wake headache


In [11]:
#rename clean_tweet_tokenized_lemmatized to text
tweets_df['text'] = tweets_df['clean_tweet_tokenized_lemmatized']
tweets_df.drop('clean_tweet_tokenized_lemmatized',axis=1, inplace=True)

In [13]:
#save the data  
tweets_df.to_csv('preprocessed/test_full.csv', index=False)


In [14]:
#check for nan
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   tweet            10000 non-null  object
 1   clean_tweet      10000 non-null  object
 2   tokenized_tweet  10000 non-null  object
 3   text             10000 non-null  object
dtypes: object(4)
memory usage: 312.6+ KB
