# Importing Libraries

In [28]:
from read_data import combine_all, combine_category, read_data

In [29]:
import string

from textblob import TextBlob

# Scraped Data

In [30]:
# Build the working DataFrame via read_data.combine_all()
df = combine_all()

In [31]:
# Summarise the loaded frame: column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1101345 entries, 0 to 60532
Data columns (total 5 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   Name         1101261 non-null  object
 1   Comment      1101078 non-null  object
 2   Time         1101345 non-null  object
 3   Likes        1101345 non-null  int64 
 4   Reply Count  1101345 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 50.4+ MB


# Data Cleaning

In [32]:
# Make every comment a str for the text-processing steps below.
# Missing comments are replaced with "" first: astype(str) on its own would
# turn NaN into the literal text "nan", which would then be analysed as if
# it were a real comment.
df["Comment"] = df["Comment"].fillna("").astype(str)

In [33]:
def get_wordnet_pos(pos_tag):
    """Map a part-of-speech tag to the corresponding ``wordnet`` POS constant.

    The decision is made on the first letter of ``pos_tag``:
    'J' -> ``wordnet.ADJ``, 'V' -> ``wordnet.VERB``, 'R' -> ``wordnet.ADV``.
    Tags starting with 'N' and every other tag map to ``wordnet.NOUN``.

    NOTE(review): ``wordnet`` is not imported in the visible cells -
    presumably ``nltk.corpus.wordnet``; confirm the import exists.
    """
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    if pos_tag.startswith('V'):
        return wordnet.VERB
    if pos_tag.startswith('R'):
        return wordnet.ADV
    # Nouns ('N...') and any unrecognised tag share the same fallback.
    return wordnet.NOUN

def clean_text(text, stop_words=None):
    """Normalise a raw comment into a space-joined string of lemmas.

    Pipeline: lowercase -> split on whitespace -> strip leading/trailing
    punctuation from each token -> drop empty tokens, tokens containing a
    digit and stop words -> POS-tag -> lemmatise -> drop one-character
    results -> join with single spaces.

    Parameters
    ----------
    text : str
        The raw comment.
    stop_words : collection of str, optional
        Words to discard. Defaults to ``stopwords.words('english')``, which
        is fetched on every call; pass a prebuilt set when applying this
        function row by row so the lookup is done only once.

    Returns
    -------
    str
        The cleaned text; ``""`` when no token survives the filters.

    NOTE(review): ``stopwords``, ``pos_tag`` and ``WordNetLemmatizer`` are
    not imported in the visible cells - presumably they come from nltk;
    confirm the imports exist.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    if not isinstance(stop_words, (set, frozenset)):
        # Set membership is O(1); the original scanned a list for every token.
        stop_words = set(stop_words)

    # split() with no argument breaks on any whitespace run, so words
    # separated by newlines or tabs are no longer glued into one token.
    tokens = [word.strip(string.punctuation) for word in text.lower().split()]
    tokens = [
        word for word in tokens
        if word
        and not any(char.isdigit() for char in word)
        and word not in stop_words
    ]
    if not tokens:
        return ""

    # One lemmatizer for the whole comment instead of one per token.
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in pos_tag(tokens)
    ]
    # Single-letter lemmas are dropped.
    return " ".join(lemma for lemma in lemmas if len(lemma) > 1)

# Performing Sentiment Analysis

In [43]:
def sentiment_analysis(text):
    """Score comments with TextBlob and label each one by polarity sign.

    Parameters
    ----------
    text : iterable of str
        The comments to score, one per row of the module-level ``df`` and in
        the same order (the notebook passes ``df["Comment"]``). A length
        mismatch makes the column assignment raise ``ValueError``.

    Returns
    -------
    The module-level ``df``, modified in place with three columns added or
    overwritten:

    * ``'Subjectivity'`` - ``TextBlob(comment).sentiment.subjectivity``
    * ``'Polarity'``     - ``TextBlob(comment).sentiment.polarity``
    * ``'Analysis'``     - ``'Negative'`` (polarity < 0), ``'Neutral'``
      (polarity == 0) or ``'Positive'`` (polarity > 0)
    """
    def label_for(score):
        if score < 0:
            return 'Negative'
        if score == 0:
            return 'Neutral'
        return 'Positive'

    # Build one TextBlob per comment and read both scores from it; the
    # original built two blobs per comment, doubling the analysis cost.
    subjectivities = []
    polarities = []
    for comment in text:
        sentiment = TextBlob(comment).sentiment
        subjectivities.append(sentiment.subjectivity)
        polarities.append(sentiment.polarity)

    # Lists are assigned positionally, so a non-unique index on df is fine.
    df['Subjectivity'] = subjectivities
    df['Polarity'] = polarities
    df['Analysis'] = df['Polarity'].apply(label_for)
    return df

In [44]:
# Score every comment. sentiment_analysis adds the 'Subjectivity', 'Polarity'
# and 'Analysis' columns to df and returns it, so the frame is shown as the
# cell output.
sentiment_analysis(df["Comment"])

Unnamed: 0,Name,Comment,Time,Likes,Reply Count,Subjectivity,Polarity,TextBlob_Analysis
0,MrBeast,"Like I said in the video, subscribe if you hav...",2021-03-27T23:31:32Z,260829,419,0.400000,1.0000,Positive
1,Alisha Gouker,Looks scary,2022-04-02T23:34:40Z,0,0,1.000000,-0.5000,Negative
2,Lea Wodi,How does it work with breathing there? I mean ...,2022-04-02T23:20:34Z,0,0,0.687500,-0.3125,Negative
3,Alex gamer,Oxygen ?,2022-04-02T21:19:16Z,0,0,0.000000,0.0000,Neutral
4,Deedee Brown,Oh my don't know if I could do that,2022-04-02T20:43:11Z,0,0,0.000000,0.0000,Neutral
...,...,...,...,...,...,...,...,...
60528,Poke ax,Wow,2021-04-07T18:45:37Z,11,9,1.000000,0.1000,Positive
60529,Friday game week,first,2021-04-07T18:45:37Z,7,26,0.333333,0.2500,Positive
60530,Zaptix,first,2021-04-07T18:45:36Z,16,14,0.333333,0.2500,Positive
60531,ROTNOX,first,2021-04-07T18:45:36Z,10,11,0.333333,0.2500,Positive
