# Emotions Text Classifier Project (Preprocessing Notebook)

### Project Prerequisites

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import spacy
from matplotlib import pyplot as pl
from transformers import pipeline

### Preparing the Data

In [2]:
# Raw tweet/emotion dataset, fetched directly from the upstream GitHub repository.
DATA_URL = "https://github.com/abishekarun/Text-Emotion-Classification/blob/master/text_emotion.csv?raw=true"

df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...


In [3]:
# Keep the text of a random subset of tweets. Assigning the sampled Series back
# aligns on the index, so every row that was not drawn ends up with NaN in
# "content"; those rows are removed by the dropna step further down.
# A fixed seed makes the subset identical on every run.
SAMPLE_SIZE = 1000
RANDOM_SEED = 42

df["content"] = df["content"].sample(SAMPLE_SIZE, random_state=RANDOM_SEED)

#### Checking for NaNs in the DataFrame

In [4]:
# Per-column NaN counts before the incomplete rows are removed.
nulls_before = df.isna().sum()
print(nulls_before)

# Remove, in place, every row that contains at least one NaN.
df.dropna(inplace=True)

# Per-column NaN counts afterwards.
nulls_after = df.isna().sum()
print(nulls_after)

tweet_id         0
sentiment        0
author           0
content      39000
dtype: int64
tweet_id     0
sentiment    0
author       0
content      0
dtype: int64


#### Dropping unwanted columns 

In [5]:
# Only the tweet text is needed from here on; drop the other columns in place.
unwanted_columns = ["tweet_id", "sentiment", "author"]
df.drop(columns=unwanted_columns, inplace=True)
df.head()

Unnamed: 0,content
21,"Wondering why I'm awake at 7am,writing a new s..."
35,Screw you @davidbrussee! I only have 3 weeks...
75,I miss Voobys!
126,Need to pack for CALI CALI! Cannot waittt! Thi...
139,"I cant sleep, but im too sore to move"


### Preprocessing

In [6]:
_SPACY_PIPELINES = {}  # model name -> loaded spaCy pipeline, filled on first use


def Preprocessing(x, model_name="en_core_web_sm"):
    """Lemmatize a text and return the lemmas joined by single spaces.

    The spaCy pipeline is loaded once per model name and reused on later
    calls, so passing this function to ``Series.apply`` no longer reloads
    the model for every row.

    Parameters
    ----------
    x : str
        Raw text to process.
    model_name : str, optional
        Name of the spaCy pipeline to load; defaults to "en_core_web_sm".

    Returns
    -------
    str
        The ``lemma_`` of every token in ``x``, separated by single spaces.
    """
    nlp = _SPACY_PIPELINES.get(model_name)
    if nlp is None:
        # First use of this model: load it and remember it for later calls.
        nlp = _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return " ".join(token.lemma_ for token in nlp(x))

In [7]:
# Lemmatizing every tweet is slow, so the result is cached on disk: the
# processed text is computed and written only when the cache file is missing.
PROCESSED_TEXT_PATH = Path("Data/processed_text.csv")

if not PROCESSED_TEXT_PATH.exists():
    PROCESSED_TEXT_PATH.parent.mkdir(parents=True, exist_ok=True)
    df["processed_content"] = df["content"].apply(Preprocessing)
    df["processed_content"].to_csv(PROCESSED_TEXT_PATH, index_label=False)

# Always read the file back so the frame has the same layout whether the cache
# was just created or already existed.
processed_text_df = pd.read_csv(PROCESSED_TEXT_PATH)
processed_text_df.head()

Unnamed: 0,processed_content
1,Layin n bed with a headache ughhhh ... waiti...
3,want to hang out with friend soon !
21,"wonder why I be awake at 7am , write a new son..."
83,goooood mooorne people ... sun be out .. defin...
127,be miserable I feel like I m gona cry sux !


In [14]:
# Hugging Face pipeline used to label each text with an emotion.
EMOTION_TASK = "text-classification"
EMOTION_MODEL = "j-hartmann/emotion-english-distilroberta-base"

clf = pipeline(EMOTION_TASK, model=EMOTION_MODEL)

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at j-hartmann/emotion-english-distilroberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [16]:
def get_emotion(x):
    """Return the label of the first prediction that ``clf`` gives for ``x``."""
    predictions = clf(x)
    first_prediction = predictions[0]
    return first_prediction["label"]

In [24]:
# Classify every lemmatized text and store the predicted label next to it.
emotion_labels = processed_text_df["processed_content"].apply(get_emotion)
processed_text_df["emotion"] = emotion_labels
processed_text_df.head()

Unnamed: 0,processed_content,emotion
1,Layin n bed with a headache ughhhh ... waiti...,sadness
3,want to hang out with friend soon !,joy
21,"wonder why I be awake at 7am , write a new son...",anger
83,goooood mooorne people ... sun be out .. defin...,sadness
127,be miserable I feel like I m gona cry sux !,sadness


In [25]:
# Write the predicted emotion labels (with their row index) to disk.
emotion_column = processed_text_df["emotion"]
emotion_column.to_csv("Data/processed_emotions.csv", index_label=False)