In [1]:
# importing needed packages here

import os
import re
import spacy
import hashlib
import numpy as np
import pandas as pd
import seaborn as sns

from wordcloud import WordCloud
import matplotlib.pyplot as plt
from tqdm import tqdm
from collections import Counter
from spacy.matcher import Matcher
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from nltk.tokenize import WordPunctTokenizer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, 
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

SyntaxError: trailing comma not allowed without surrounding parentheses (4035637606.py, line 21)

# Objective

# Context

# Importing the data

In [None]:
#Loading the required data
data_path = ""
df = pd.read_csv(data_path)

In [None]:
df.head()

# Loading spacy

In [None]:
nlp = spacy.load('en_core_web_sm')

# here upload stopwords from nltk but it can be from spacy
# you can also add or remove stopwords
# Adding single token as stopword → nlp.Defaults.stop_words.add(“perfect”)
# Adding multiple tokens → nlp.Defaults.stop_words|={“hot”,”cobb”}

# Removing single token →nlp.Defaults.stop_words.remove(“what”)
# Removing multiple tokens → nlp.Defaults.stop_words -= {“who”, “when”}

en_stopwords = nlp.Defaults.stop_words

In [None]:
# As in the exercise notebook BLU09 to merge the text together as a list

#nlp.add_pipe("merge_entities", after="ner")

# Let's get the text of the news article processed by SpaCy - This might take a while depending on 
#   your hardware (a break to walk the dog? 🐶)

#docs = list(tqdm(nlp.pipe(df["text"], batch_size=20, n_process=cpu_count-1), total=len(df["text"])))
#docs[:3]

# Text Cleaning

In [None]:
def remove_punctuation(text):
    """
    Hint: Remember the good old RegEx from 2 LUs ago
        how can I just remove everything except words, digits and spaces?
    """
    
    # text = re.sub(...)
    
    # YOUR CODE HERE
    text = re.sub(r"[^\w\d\s]+", "", text)

    return text.lower()


def remove_stopwords(text, stopwords):
    """
    Hint: You may want to split the text into tokens using the tokenizer, it might help when searching for stopwords
        If you do, do not forget to join the tokens afterwards!
    """
    
    # YOUR CODE HERE
    tokenizer = WordPunctTokenizer()
    BoW = [words for words in tokenizer.tokenize(text) if words not in stopwords]
    text_processed = " ".join(BoW)
    
    # Return the full string again here
    return text_processed

In [None]:
def preprocess_text(df, column):
    
    df_processed = df.copy()
    
    df_processed[column] = df_processed[column].apply(remove_punctuation)
    
    
    df_processed[column] = df_processed[column].apply(remove_stopwords, stopwords = en_stopwords)
    
    return df_processed

# Establishing a baseline model

In [None]:
def train_split(df_processed, text=None, label=None):
    
    return X_train, X_test, y_train, y_test = train_test_split(df_processed[text], df_processed[label], 
                                                    test_size=0.2, random_state=42, stratify=df_processed[label])

In [None]:
def baseline_with_tfidf(X_train, X_test, y_train, y_test):
    """
    Train a Random Forest using sklearn's Pipeline and return the trained model and its accuracy in the test set.
    """
    
    # pipe = (...)
    # pipe.fit(...)
    # (...)
    
    # YOUR CODE HERE
    # important: only train the vectorizer on the training data (but with pipeline no problems with it)
    # we can use for classifier: MultinomialNB(), SVC(), GradientBoostingClassifier()
    pipe = Pipeline([("Tfidf", TfidfVectorizer()), # ngram_range(1,2) can be added
                    ("classifier", RandomForestClassifier())])
    pipe.fit(X_train, y_train)
    
    y_pred = pipe.predict(X_test)
    y_prob = model.predict_proba(X_test)
    acc = round(accuracy_score(y_test, y_pred),3)
    f1 = round(f1_score(y_true, y_pred, average='weighted'),3) # use "weighted" for umbalanced data otherwise use ="macro"
    precision = round(precision_score(y_true, y_pred, average='weighted'),3)
    recall = round(recall_score(y_true, y_pred, average='weighted'),3)
    
    return pipe, y_pred, y_prob, acc, f1, precision, recall

In [None]:
pipe, y_pred, y_prob, acc, f1, precision, recall = baseline_with_tfidf(train_split(preprocess_text(df, column), text=df[""], label=df[""])

In [None]:
cm = confusion_matrix(y_test, y_pred)

# Plot the confusion matrix
#In this matrix, the first row represents the negative class and the second row represents the positive class.
# The first column represents the instances that were predicted to be negative 
# and the second column represents the instances that were predicted to be positive.

labels = ['Negative', 'Positive'] # be careful with the order of the label
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
ax.set_xlabel("Predicted Labels")
ax.set_ylabel("True Labels")
plt.show()

In [None]:
def cm_roc_curve(y_test, y_pred, acc, precision, recall, f1):
    print(f'Accuracy of the model: {np.round(acc*100,2)}%')
    print(f'Precision Score of the model: {np.round(precision*100,2)}%')
    print(f'Recall Score of the model: {np.round(recall*100,2)}%')
    print(f'F1 Score of the model: {np.round(f1*100,2)}%')
    print('-'*50)
    print(classification_report(y_test,y_pred))
    
    fig, ax = plt.subplots(1, 2, figsize = (25,  8))
    ax1 = plot_confusion_matrix(y_test, y_pred, ax= ax[0], cmap= 'YlGnBu')
    ax2 = plot_roc(y_test, y_prob, ax= ax[1], plot_macro= False, plot_micro= False, cmap= 'summer')

In [None]:
cm_roc_curve(y_test, y_pred, acc, precision, recall, f1)

# Data visualization and Features engineering 

## Data visualization

In [None]:
df[label_column].value_counts().plot(kind='bar')

# add a title and labels to the plot
plt.title('Count of label column')
plt.xlabel('Value')
plt.ylabel('Count')

# show the plot
plt.show()

In [None]:
df.isnull().sum()

### Wordcloud

 To change the column name in this code and maybe remore the stopwords (look at wordcloud function)

In [None]:
# add stopwords= parameters in the wordcloud

def nonan(x):
    if type(x) == str:
        return x.replace("\n", "")
    else:
        return ""

text = ' '.join([nonan(abstract) for abstract in df["comment_text"]])
wordcloud = WordCloud(max_font_size=None, background_color='black', collocations=False,
                      width=1200, height=1000).generate(text)
fig = px.imshow(wordcloud)
fig.update_layout(title_text='Common words in comments')

## Features engineering

In [None]:
#Generating the column text_len
df['text_len']=df_train['text'].apply(lambda x:len(x.split()))

In [None]:
#The below function comes in handy to count the number of characters in a text
def char_count(text):
    charc=0
    for char in text.split():
        charc +=len(char)
    return charc

#Generating the column text_char_len
df'text_char_len']=df['text'].apply(char_count)

# Feature Union and pipeline

In [4]:
class Selector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a column from the dataframe to perform additional transformations on
    """ 
    def __init__(self, key):
        self.key = key
        
    def fit(self, X, y=None):
        return self
    

class TextSelector(Selector):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on text columns in the data
    """
    def transform(self, X):
        return X[self.key]
    
    
class NumberSelector(Selector):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def transform(self, X):
        return X[[self.key]]

## The following pipeline and Feature union need to be changed acording to the data

In [None]:
text_pipe = Pipeline([("selector", TextSelector(key="text")),
                      ("tfidf", TfidfVectorizer())])

nb_adj_adv_pipe = Pipeline([("selector", NumberSelector(key="nb_adj_adv")),
                            ("standard", StandardScaler())])

nb_words_pipe = Pipeline([("selector", NumberSelector(key="nb_words")),
                          ("standard", StandardScaler())])

doc_length_pipe = Pipeline([("selector", NumberSelector(key="doc_length")),
                            ("standard", StandardScaler())])

avg_word_length_pipe = Pipeline([("selector", NumberSelector(key="avg_word_length")),
                                 ("standard", StandardScaler())])

In [None]:
feats = FeatureUnion([("text", text_pipe), 
                      ("nb_adj_adv", nb_adj_adv_pipe),
                     ("nb_words", nb_words_pipe),
                     ("doc_length", doc_length_pipe),
                     ("avg_word_length", avg_word_length_pipe)])

In [None]:
def improved_pipeline(feats, X_train, X_test, y_train, y_test):
    """
    Train a Random Forest using sklearn's Pipeline and return the trained model and its accuracy in the test set.
    Don't forget to add the feats to the Pipeline!
    """
    
    # pipe = (...)
    # pipe.fit(...)
    # (...)
    
    # YOUR CODE HERE
    # important: only train the vectorizer on the training data (but with pipeline no problems with it)
    # we can use for classifier: MultinomialNB(), BernoulliNB(), SVC(), GradientBoostingClassifier(), 
    # LogisticRegression()
    pipe = Pipeline([('feats', feats),
                     ('clf', RandomForestClassifier())])
    
    pipe.fit(X_train, y_train)
    
    y_pred = pipe.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted') # use "weighted" for umbalanced data otherwise use ="macro"
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    
    return pipe, y_pred, y_prob, acc, f1, precision, recall

In [None]:
pipe, pipe, y_pred, y_prob, acc, f1, precision, recall = improved_pipeline(feats, train_split(preprocess_text(df, column), text=df[""], label=df[""]))

In [None]:
cm = confusion_matrix(y_true, y_pred)

# Plot the confusion matrix
#In this matrix, the first row represents the negative class and the second row represents the positive class.
# The first column represents the instances that were predicted to be negative 
# and the second column represents the instances that were predicted to be positive.

labels = ['Negative', 'Positive'] # be careful with the order of the label
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
ax.set_xlabel("Predicted Labels")
ax.set_ylabel("True Labels")
plt.show()

In [None]:
cm_roc_curve(y_test, y_pred, acc, precision, recall, f1)