## Import libraries

In [13]:
import numpy as np
import pandas as pd
import re

import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from gensim.models import Word2Vec

## Read tweets

In [14]:
df = pd.read_csv(r'../assets/data/tweets.csv')

tweets = df['text']

tweets

0              RIP #TomVerlaine https://t.co/w40iGeqmKK
1     Reposted from @vintage.cheese #shesjustdrawnth...
2     Reposted from @thehorrorgallery 😂🤣 🔥 \n\nFollo...
3     Reposted from @jess_rene_ #art #witch #witchcr...
4     Reposted from @bjanetrose 🥀 @kostle_laugh_with...
                            ...                        
95    Reposted from @thelegendsofmusic The White Str...
96    Reposted from @paulrosolie  In the last two we...
97    Reposted from @bjanetrose 🥀 #wonderwoman 🥀  ii...
98    Reposted from dropkickblues Chuck Berry, what ...
99    Hell of a crowd in Wellington, NZ tonight, you...
Name: text, Length: 100, dtype: object

## Pre-process tweets

In [15]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

tweetList = []
tweetSentences = []

stopWords = set(stopwords.words('english'))

for i, tweet in enumerate(tweets):
    # Convert the tweet to lowercase using REGEX
    for f in re.findall("([A-Z]+)", tweet):
        tweet = tweet.replace(f, f.lower())

    # Removing special characters and replacing them with a space
    tweet = re.sub("[^A-Za-z0-9]", " ", tweet, 0, re.IGNORECASE)
    
    # From a single sentence, store all the words 
    wordsInTweet = nltk.word_tokenize(tweet)
    
    # Filter out all the stop words
    wordsInTweet = [word for word in wordsInTweet if word not in stopWords]
    
    # Lemmatize each of the tweets
    wordsInTweet = [lemmatizer.lemmatize(word) for word in wordsInTweet]
    
    # Append them to a list
    tweetList.append(wordsInTweet)
    
    # Remake the sentences after removing the stop words
    tweetSentences.append(' '.join(wordsInTweet))

In [16]:
tweetList

[['rip', 'tomverlaine', 'http', 'co', 'w40igeqmkk'],
 ['reposted',
  'vintage',
  'cheese',
  'shesjustdrawnthatway',
  'terrygilliam',
  'montypython',
  'legend',
  '1960s',
  '1970s',
  'vintagecheese',
  'iiii',
  'http',
  'co',
  '2hpfzj1jVM'],
 ['reposted',
  'thehorrorgallery',
  'follow',
  'u',
  'thehorrorgallery',
  'peligrosovalley',
  'iiii',
  'http',
  'co',
  'xkpm5rUwp3'],
 ['reposted',
  'jess',
  'rene',
  'art',
  'witch',
  'witchcraft',
  'instaart',
  'macabre',
  'macabreart',
  'artist',
  'darkart',
  'dark',
  'instagood',
  'salem',
  'joseph',
  'tomanek',
  'philip',
  'hofmanner',
  'iiii',
  'http',
  'co',
  'wvuuumxzll'],
 ['reposted',
  'bjanetrose',
  'kostle',
  'laugh',
  'dead',
  'iiii',
  'http',
  'co',
  'cgv2hmyl1i'],
 ['posted', 'photo', 'http', 'co', 'Vjjloc4scg'],
 ['posted', 'photo', 'http', 'co', '2y6cfktrvw'],
 ['posted', 'photo', 'http', 'co', 'a8nKjpfqho'],
 ['reposted',
  'bizarredoctor',
  'season',
  'witch',
  'sound',
  'artist'

In [17]:
tweetSentences

['rip tomverlaine http co w40igeqmkk',
 'reposted vintage cheese shesjustdrawnthatway terrygilliam montypython legend 1960s 1970s vintagecheese iiii http co 2hpfzj1jVM',
 'reposted thehorrorgallery follow u thehorrorgallery peligrosovalley iiii http co xkpm5rUwp3',
 'reposted jess rene art witch witchcraft instaart macabre macabreart artist darkart dark instagood salem joseph tomanek philip hofmanner iiii http co wvuuumxzll',
 'reposted bjanetrose kostle laugh dead iiii http co cgv2hmyl1i',
 'posted photo http co Vjjloc4scg',
 'posted photo http co 2y6cfktrvw',
 'posted photo http co a8nKjpfqho',
 'reposted bizarredoctor season witch sound artist minelauvart iiii http co jspww6de3k',
 'happy birthday steveadler iiii http co q2hjtdrmsa',
 'reposted patfromstoke1 reposted fakedreams ai iiii http co spfo4fhakc',
 'reposted bjanetrose artist trinitycatdeline iiii http co 9lgrwvg1nj',
 'reposted jayprehistoricpets one absolutely giant komodo dragon seems like living dinosaur run laura iiii 