In [1]:
import numpy as np
import pandas as pd
import re
from string import punctuation                   # the punctuation characters to strip

from nltk.tokenize import word_tokenize          # to divide strings into tokens
from nltk.stem import WordNetLemmatizer          # to lemmatize the tokens
from nltk.corpus import stopwords                # to remove the stopwords
# part-of-speech tagger, used to pick the lemmatizer's POS argument
from nltk import pos_tag

In [4]:
# Module-level accumulators filled by load_tweets().
tweets = []
ids = []


def load_tweets(filename):
    """Append every `<id>,<tweet>` line of `filename` to `ids` and `tweets`.

    Each line is split on its FIRST comma only, so commas inside the tweet
    text are preserved. The id is parsed as an int and the tweet has its
    trailing whitespace (including the newline) stripped. Lines that contain
    only whitespace are skipped instead of raising ``ValueError``.

    Args:
        filename: Path to a UTF-8 text file with one `<id>,<tweet>` per line.

    Raises:
        ValueError: If a non-blank line does not start with an integer id.

    Side effects:
        Mutates the module-level lists `ids` and `tweets`; returns nothing.
        Both names must still be lists when this is called (i.e. call it
        before `tweets` is converted to a NumPy array).
    """
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # e.g. an empty line at the end of the file
                continue
            # partition() splits once, leaving any later commas in the tweet.
            tweet_id, _, text = line.partition(',')
            ids.append(int(tweet_id))
            tweets.append(text.rstrip())
# Read the test split into the module-level `ids` / `tweets` lists.
load_tweets('twitter-datasets/test_data.txt')

# Swap the Python list for an ndarray so it can be indexed like an array.
tweets = np.asarray(tweets)

print(f'{tweets.shape[0]} tweets loaded')

10000 tweets loaded


In [5]:
# One row per tweet; the tweet ids become the row index.
tweets_df = pd.DataFrame(
    data={'tweet': tweets},
    index=ids,
)

In [6]:
# Preview the first rows of the freshly built frame.
tweets_df.head()

Unnamed: 0,tweet
1,sea doo pro sea scooter ( sports with the port...
2,<user> shucks well i work all week so now i ca...
3,i cant stay away from bug thats my baby
4,<user> no ma'am ! ! ! lol im perfectly fine an...
5,"whenever i fall asleep watching the tv , i alw..."


In [7]:
# Lazily-built cache of NLTK's English stopword list (see _english_stopwords).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, building it on first use."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def remove_stopwords(text, stop_words=None):
    """Return the tokens of `text` that are not stopwords, in their original order.

    Args:
        text: Iterable of tokens (e.g. the list produced by `word_tokenize`).
        stop_words: Optional container of words to drop; anything supporting
            `in` works, a set is fastest. When omitted, NLTK's English
            stopword list is used. That list is loaded once and cached as a
            frozenset, so it is not rebuilt for every token and each
            membership test is a hash lookup.

    Returns:
        A new list with the remaining tokens.
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    return [w for w in text if w not in stop_words]

def clean_text(text):
    '''Normalise a raw tweet before tokenisation.

    Steps, in order:
      1. lowercase the text;
      2. delete the literal `<url>` placeholder (done before step 3, which
         would otherwise leave the word "url" behind);
      3. delete every character in `string.punctuation` -- note that this
         also turns `<user>` into the plain word "user";
      4. delete newline characters;
      5. delete every word that contains at least one digit.

    Deleted spans are replaced by nothing, so runs of spaces may remain.

    Args:
        text: The raw tweet as a string.

    Returns:
        The cleaned string.
    '''
    text = text.lower()
    text = re.sub(r'<url>', '', text)
    #text = re.sub(r'<user>', '', text)
    text = re.sub(r'[%s]' % re.escape(punctuation), '', text)
    text = re.sub(r'\n', '', text)
    # Raw string: '\w' and '\d' are not valid Python string escapes.
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Store the normalised text next to the raw tweet.
tweets_df['clean_tweet'] = tweets_df['tweet'].apply(clean_text)

In [8]:
# Tokenise the cleaned text, then derive a stopword-free copy of the tokens.
tweets_df['tokenized_tweet'] = tweets_df['clean_tweet'].apply(word_tokenize)
tweets_df['tokenized_tweet_no_stopwords'] = tweets_df['tokenized_tweet'].apply(remove_stopwords)

In [9]:
# Lemmatize tokens using their part-of-speech tag.
def _wordnet_pos(treebank_tag):
    """Return the WordNet POS letter for a Penn Treebank tag, or None if it has none."""
    if treebank_tag.startswith('J'):
        return 'a'  # JJ, JJR, JJS -> adjective
    if treebank_tag.startswith('V'):
        return 'v'  # VB* -> verb
    if treebank_tag.startswith('N'):
        return 'n'  # NN* -> noun
    if treebank_tag.startswith('RB'):
        return 'r'  # RB, RBR, RBS -> adverb ('RP' particles are excluded)
    return None


def lemmatize_with_pos_tag(tokenized_text, lemmatizer=None, tagger=None):
    """Lemmatize each token according to its part-of-speech tag.

    The tagger's tags are translated to the POS letters the lemmatizer
    expects ('a', 'v', 'n', 'r'). Taking the lowercased first letter of the
    tag is not enough: adjective tags start with 'J' and adverb tags with
    'R', so neither would ever be lemmatized with its own POS. Tokens whose
    tag has no WordNet equivalent are lemmatized with the lemmatizer's
    default POS.

    NOTE(review): adjectives and adverbs may now get different lemmas than
    with a first-letter mapping; any other split that is preprocessed
    separately should use the same mapping so the vocabularies match.

    Args:
        tokenized_text: List of tokens for one tweet.
        lemmatizer: Optional object with a `lemmatize(token, pos=...)`
            method; defaults to a new `WordNetLemmatizer()`.
        tagger: Optional callable mapping a token list to `(token, tag)`
            pairs; defaults to `nltk.pos_tag`, whose tags are assumed to be
            Penn Treebank tags (its default tagset).

    Returns:
        A list with one lemma per input token, in the same order.
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()
    if tagger is None:
        tagger = pos_tag

    lemmatized_tokens = []
    for token, tag in tagger(tokenized_text):
        wordnet_pos = _wordnet_pos(tag)
        if wordnet_pos is None:
            lemmatized_tokens.append(lemmatizer.lemmatize(token))
        else:
            lemmatized_tokens.append(lemmatizer.lemmatize(token, pos=wordnet_pos))
    return lemmatized_tokens
# Lemmatize the stopword-free tokens of every tweet.
tweets_df['clean_tweet_tokenized_lemmatized'] = (
    tweets_df['tokenized_tweet_no_stopwords'].apply(lemmatize_with_pos_tag)
)

In [10]:
# Intermediate columns that could be discarded after lemmatisation.
# The drop itself is currently disabled, so the frame keeps all of them.
drop_columns = [
    'tweet',
    'clean_tweet',
    'tokenized_tweet',
]
#tweets_df = tweets_df.drop(drop_columns, axis=1)

In [11]:
# Convert each token-list column into a single space-separated string.
# Values that are already strings are passed through unchanged, so running
# this cell a second time does not split the text into single characters.
def _join_tokens(tokens):
    """Join `tokens` with single spaces; return a string input unchanged."""
    return tokens if isinstance(tokens, str) else ' '.join(tokens)


for token_column in (
    'clean_tweet_tokenized_lemmatized',
    'tokenized_tweet',
    'tokenized_tweet_no_stopwords',
):
    tweets_df[token_column] = tweets_df[token_column].apply(_join_tokens)


In [12]:
# Preview the first rows with all derived columns.
tweets_df.head()

Unnamed: 0,tweet,clean_tweet,tokenized_tweet,tokenized_tweet_no_stopwords,clean_tweet_tokenized_lemmatized
1,sea doo pro sea scooter ( sports with the port...,sea doo pro sea scooter sports with the porta...,sea doo pro sea scooter sports with the portab...,sea doo pro sea scooter sports portable seadoo...,sea doo pro sea scooter sport portable seadoo ...
2,<user> shucks well i work all week so now i ca...,user shucks well i work all week so now i cant...,user shucks well i work all week so now i cant...,user shucks well work week cant come cheer oh ...,user shuck well work week cant come cheer oh p...
3,i cant stay away from bug thats my baby,i cant stay away from bug thats my baby,i cant stay away from bug thats my baby,cant stay away bug thats baby,cant stay away bug thats baby
4,<user> no ma'am ! ! ! lol im perfectly fine an...,user no maam lol im perfectly fine and not ...,user no maam lol im perfectly fine and not con...,user maam lol im perfectly fine contagious any...,user maam lol im perfectly fine contagious any...
5,"whenever i fall asleep watching the tv , i alw...",whenever i fall asleep watching the tv i alwa...,whenever i fall asleep watching the tv i alway...,whenever fall asleep watching tv always wake h...,whenever fall asleep watch tv always wake head...


In [13]:
# Rename `clean_tweet_tokenized_lemmatized` to `text`.
# rename() returns a new frame, keeps the column in place, and ignores a
# label that is no longer present, so re-running this cell is a no-op
# rather than a KeyError.
tweets_df = tweets_df.rename(columns={'clean_tweet_tokenized_lemmatized': 'text'})

In [14]:
# Save the preprocessed frame as CSV.
# NOTE(review): index=False omits the DataFrame index, which holds the tweet
# ids -- confirm that downstream code does not need them.
# NOTE(review): assumes the `preprocessed/` directory already exists -- TODO confirm.
tweets_df.to_csv('preprocessed/test_full.csv', index=False)


In [15]:
# Check for missing values: info() lists the non-null count of every column.
# NOTE(review): an empty string counts as non-null here, but pandas.read_csv
# parses empty fields as NaN by default, so a tweet whose text became empty
# during cleaning would only show up as NaN after the CSV is reloaded.
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 1 to 10000
Data columns (total 5 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   tweet                         10000 non-null  object
 1   clean_tweet                   10000 non-null  object
 2   tokenized_tweet               10000 non-null  object
 3   tokenized_tweet_no_stopwords  10000 non-null  object
 4   text                          10000 non-null  object
dtypes: object(5)
memory usage: 468.8+ KB
