# Connecting with google drive

In [None]:
from google.colab import drive
drive.mount("/content/drive/")

Mounted at /content/drive/


# Connecting with Kaggle

In [None]:
# install kaggle
!pip install -q kaggle

In [None]:
# upload kaggle.json file
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"arbazkhancs","key":"140450ae4ad01edb47d18c912d681ad3"}'}

In [None]:
# create a kaggle folder
!mkdir ~/.kaggle

In [None]:
# copy the kaggle.json file to above folder
!cp kaggle.json ~/.kaggle/

In [None]:
# permission for the json to act
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# to list kaggle dataset
! kaggle datasets list

ref                                                      title                                            size  lastUpdated          downloadCount  voteCount  usabilityRating  
-------------------------------------------------------  ----------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
syedanwarafridi/vehicle-sales-data                       Vehicle Sales Data                               19MB  2024-02-21 20:16:17           5368         89  1.0              
devi5723/e-commerce-cosmetics-dataset                    E-commerce Cosmetic Products                      1MB  2024-02-28 14:46:12            744         24  0.9411765        
tarunrm09/climate-change-indicators                      Climate change Indicators                        34KB  2024-02-22 08:53:54           2511         60  1.0              
raphaelmanayon/temperature-and-ice-cream-sales           Temperature and Ice Cream Sales                   1KB  202

# Download the dataset

In [None]:
! kaggle datasets download -d hariharasudhanas/twitter-emoji-prediction

Downloading twitter-emoji-prediction.zip to /content
  0% 0.00/3.50M [00:00<?, ?B/s]
100% 3.50M/3.50M [00:00<00:00, 177MB/s]


In [None]:
# unzip the downloaded file
! unzip twitter-emoji-prediction.zip

Archive:  twitter-emoji-prediction.zip
  inflating: Mapping.csv             
  inflating: OutputFormat.csv        
  inflating: Test.csv                
  inflating: Train.csv               


# Text Preprocessing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Import Dataframe

In [None]:
df = pd.read_csv("Train.csv")

In [None]:
df.drop(columns=["Unnamed: 0"], inplace=True)

In [None]:
df.head()

Unnamed: 0,TEXT,Label
0,Vacation wasted ! #vacation2017 #photobomb #ti...,0
1,"Oh Wynwood, you’re so funny! : @user #Wynwood ...",1
2,Been friends since 7th grade. Look at us now w...,2
3,This is what it looks like when someone loves ...,3
4,RT @user this white family was invited to a Bl...,3


## Information about the dataframe

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   TEXT    70000 non-null  object
 1   Label   70000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.1+ MB


In [None]:
df.shape

(70000, 2)

In [None]:
df.isnull().sum()

TEXT     0
Label    0
dtype: int64

In [None]:
df.describe()

Unnamed: 0,Label
count,70000.0
mean,8.646657
std,5.209129
min,0.0
25%,3.0
50%,9.0
75%,13.0
max,19.0


In [None]:
df["TEXT"].unique()

array(['Vacation wasted ! #vacation2017 #photobomb #tired #vacationwasted #mcgar30 #miami @ Port of…\n',
       'Oh Wynwood, you’re so funny! : @user #Wynwood #Art #ItWas3AM #Flowers #Vibes @ Wynwood…\n',
       'Been friends since 7th grade. Look at us now we all following our dreams doing what we love and…\n',
       ...,
       "If one of my daughters is wearing this and asks me for ice cream at 2am.... I won't say no. …\n",
       'Guess who whoop people on THEIR homecoming?! #asurams @ Atlanta, Georgia\n',
       'We Love you Robbie @ Heritage Memorial Cemetery LLC\n'],
      dtype=object)

In [None]:
df["Label"].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

here we have 19 different emojis

## Convert dataframe to list

In [None]:
def get_tweets_emojiId(df, columns):
  tweets = [i for i in df[columns[0]]]
  emojiID = list(df[columns[1]])
  return tweets, emojiID

In [None]:
tweets, emojiID = get_tweets_emojiId(df, ["TEXT", "Label"])

In [None]:
tweets[:5], emojiID[:5]

(['Vacation wasted ! #vacation2017 #photobomb #tired #vacationwasted #mcgar30 #miami @ Port of…\n',
  'Oh Wynwood, you’re so funny! : @user #Wynwood #Art #ItWas3AM #Flowers #Vibes @ Wynwood…\n',
  'Been friends since 7th grade. Look at us now we all following our dreams doing what we love and…\n',
  'This is what it looks like when someone loves you unconditionally oh Puppy Brother. #htx…\n',
  "RT @user this white family was invited to a Black barbecue and i've never laughed so hard in my life\n"],
 [0, 1, 2, 3, 3])

## Wroking with the text

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize

nltk.download("stopwords")
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
stop_words = list(set(stopwords.words('english')))  # only for unique values
stop_words[:10]

['through',
 "won't",
 "needn't",
 'wouldn',
 'your',
 'having',
 'further',
 'did',
 'ain',
 'we']

In [None]:
word_tokenize("hello world")

['hello', 'world']

In [None]:
import re
def preprocess_tweet(tweet):
    tweet = re.sub(r'http\S+', '', tweet)
    tweet = re.sub(r'[^\w\s]', '', tweet)
    tweet = tweet.strip()
    return tweet

def remove_stopwords(text):
  stop_words = set(stopwords.words('english'))
  tokens = word_tokenize(text) # same as split function
  filteredTokens = [token.lower() for token in tokens if token.lower() not in stop_words]
  filteredTokens = [preprocess_tweet(token) for token in filteredTokens if len(token) > 1]
  res = ' '.join(filteredTokens)
  return res

In [None]:
df["Tweets"] = df["TEXT"].apply(lambda x: remove_stopwords(x))

In [None]:
df["Tweets"]

0        vacation wasted vacation2017 photobomb tired v...
1        oh wynwood funny user wynwood art itwas3am flo...
2        friends since 7th grade look us following drea...
3        looks like someone loves unconditionally oh pu...
4        rt user white family invited black barbecue ve...
                               ...                        
69995    yes call galina  bubie  go follow beautiful fr...
69996                 sea seattle ballard seafood festival
69997    one daughters wearing asks ice cream 2am  wo n...
69998    guess whoop people homecoming asurams atlanta ...
69999           love robbie heritage memorial cemetery llc
Name: Tweets, Length: 70000, dtype: object

# Vectorization

In [None]:
from keras.utils import pad_sequences
from keras.preprocessing.text import Tokenizer

In [None]:
tweets, emojiId = get_tweets_emojiId(df, ["Tweets", "Label"])

In [None]:
tweets[:5], emojiId[:5]

(['vacation wasted vacation2017 photobomb tired vacationwasted mcgar30 miami port of',
  'oh wynwood funny user wynwood art itwas3am flowers vibes wynwood',
  'friends since 7th grade look us following dreams love and',
  'looks like someone loves unconditionally oh puppy brother htx',
  'rt user white family invited black barbecue ve never laughed hard life'],
 [0, 1, 2, 3, 3])

In [168]:
def create_sequences(text):
  tokenizer = Tokenizer(oov_token="<nothing>")
  tokenizer.fit_on_texts(tweets)
  seq = tokenizer.texts_to_sequences(tweets)
  seq = pad_sequences(seq, padding="post")
  return seq, tokenizer

In [169]:
X_train, tokenizer = create_sequences(tweets)
y_train = np.array(emojiId)

In [170]:
X_train.shape, y_train.shape

((70000, 32), (70000,))

# mapping label with there emojis

In [171]:
df_map = pd.read_csv("Mapping.csv")

In [162]:
df_map.head()

Unnamed: 0.1,Unnamed: 0,emoticons,number
0,0,😜,0
1,1,📸,1
2,2,😍,2
3,3,😂,3
4,4,😉,4


In [163]:
map = dict(zip(df_map["number"], df_map["emoticons"]))

In [164]:
map

{0: '😜',
 1: '📸',
 2: '😍',
 3: '😂',
 4: '😉',
 5: '🎄',
 6: '📷',
 7: '🔥',
 8: '😘',
 9: '❤',
 10: '😁',
 11: '🇺🇸',
 12: '☀',
 13: '✨',
 14: '💙',
 15: '💕',
 16: '😎',
 17: '😊',
 18: '💜',
 19: '💯'}

# Save

In [165]:
import json
import pickle

In [166]:
# Save X_train
with open("/content/drive/MyDrive/Colab Notebooks/Emoji Prediction/X_train.pkl", "wb") as f:
    pickle.dump(X_train, f)

# Save y_train
with open("/content/drive/MyDrive/Colab Notebooks/Emoji Prediction/y_train.pkl", "wb") as f:
    pickle.dump(y_train, f)

# Save tokenizer
with open("/content/drive/MyDrive/Colab Notebooks/Emoji Prediction/tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

In [167]:
# Save emoji map
with open("/content/drive/MyDrive/Colab Notebooks/Emoji Prediction/emojiMap.json", "w") as f:
    json.dump(map, f)