# CS4248 NLP Project Team 18

## Installing necessary libraries

*Note*: Add every library the notebook depends on to the cell below as `!pip install LIBRARY`, so that everyone on the team runs the notebook with the same set of packages.

In [None]:
!pip install scikit-learn
!pip install pandas
!pip install matplotlib
!pip install tensorflow
!pip install shap
!pip install nltk
!pip install textstat
!pip install textblob

## Reading Data

In [None]:
import pandas as pd

# Relative path of the labelled corpus used throughout the notebook.
DATA_FILEPATH = "raw_data/fulltrain.csv"

# header=None: the first row is read as data and pandas assigns integer column labels.
df = pd.read_csv(filepath_or_buffer=DATA_FILEPATH, header=None)
df.head(n=5)

In [None]:
# Column 1 is used as the model input and column 0 as the prediction target.
y, X = df[0], df[1]

# Preview the first rows of the input and then of the target.
for preview in (X, y):
    print(preview.head())

## Data Analysis

In [None]:
import matplotlib.pyplot as plt

# Count how many rows carry each label in column 0 and draw the counts as a bar chart.
value_counts = df[0].value_counts()
ax = value_counts.plot(kind='bar')
ax.set_xlabel('Classification')
ax.set_ylabel('Count')
ax.set_title('Breakdown of Text Classification')
plt.show()

## Preprocessing

## Feature Engineering

In [None]:
import pandas as pd
import textstat
from nltk.tokenize import word_tokenize
from textblob import TextBlob

def normalize(data):
    """Standardise ``data`` to zero mean and unit (sample) standard deviation.

    Parameters
    ----------
    data : pandas.Series
        Numeric feature values. Other pandas objects whose ``std()`` is not a
        scalar fall through to the plain ``(data - mean) / std`` formula.

    Returns
    -------
    Same type as ``data``
        The z-scored values. When the standard deviation is zero (constant
        feature) or undefined (fewer than two values), the centred values are
        returned scaled to zero instead of dividing, so the feature becomes
        all zeros rather than all NaN. Missing inputs stay missing.
    """
    centered = data - data.mean()
    spread = data.std()
    # A constant or single-value feature carries no information; dividing by
    # its zero/NaN spread would poison every row with NaN.
    if pd.api.types.is_scalar(spread) and (pd.isna(spread) or spread == 0):
        return centered * 0.0
    return centered / spread

def count_tokens(sentence):
    """Return how many tokens ``word_tokenize`` produces for ``sentence``."""
    tokens = word_tokenize(sentence)
    return len(tokens)

def get_polarity(sentence):
    """Return the TextBlob sentiment polarity score of ``sentence``."""
    blob = TextBlob(sentence)
    sentiment = blob.sentiment
    return sentiment.polarity

def get_subjectivity(sentence):
    """Return the TextBlob sentiment subjectivity score of ``sentence``."""
    blob = TextBlob(sentence)
    sentiment = blob.sentiment
    return sentiment.subjectivity

def add_features(data):
    """Build a frame of normalised hand-crafted features for each text.

    Parameters
    ----------
    data : pandas.Series
        One text per row.

    Returns
    -------
    pandas.DataFrame
        One row per input text, indexed like ``data``, with the columns
        ``token_count``, ``readability``, ``polarity`` and ``subjectivity``
        (in that order), each passed through ``normalize``.
    """
    # Add features
    raw_features = {
        "token_count": data.apply(count_tokens),
        "readability": data.apply(textstat.flesch_reading_ease),
        "polarity": data.apply(get_polarity),
        "subjectivity": data.apply(get_subjectivity),
    }

    # Normalize features
    normalized = [normalize(column) for column in raw_features.values()]

    # Every Series produced by ``apply`` inherits the name of ``data``, so a
    # plain concat would yield four columns with the same label. ``keys``
    # gives each column its feature name instead.
    return pd.concat(normalized, axis=1, keys=list(raw_features))


# NOTE(review): the frame is called `train` but is read from 'balancedtest.csv' --
# confirm this is the intended file for feature extraction.
train = pd.read_csv('balancedtest.csv', header=None)

# The file has no header row: position 0 is the label, position 1 the text.
y_train = train.iloc[:, 0]
x_train = train.iloc[:, 1]

train_features = add_features(x_train)

print(train_features)

## Model Training & Evaluation

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Upper bound on the TF-IDF vocabulary size.
max_features = 10000

vectorizer = TfidfVectorizer(max_features=max_features)
# Fit on the raw text column itself rather than the notebook-level `X`: later
# cells rebind `X` (first to the TF-IDF frame, then to padded token sequences),
# so re-running this cell with `X` would silently vectorise the wrong object.
tfidf_matrix = vectorizer.fit_transform(df[1])
# NOTE: `.toarray()` materialises a dense (n_documents x max_features) matrix;
# lower `max_features` if this does not fit in memory.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

## Settings for train test split

In [None]:
## Settings

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

## Note: Change this to fit the algorithm below
# X are the features
X = tfidf_df
# y are the outputs; taken from the source frame so the value does not depend on
# whatever an earlier-run cell last left in `y`
y = df[0]
# test_size is the size of the test (0 < test_size < 1)
test_size = 0.2
# seed for random split
seed = 40
## End of Note

# Hold out `test_size` of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

### Logistic Regression

In [None]:

# Fit a default-configured logistic regression on the training split, then
# predict labels for the held-out split.
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# (display name, scoring function) pairs; each function is called as f(y_true, y_pred).
evaluation_metrics = [("Accuracy", accuracy_score), ("Confusion Matrix", confusion_matrix)]

for evaluation_metric_name, evaluation_metric_func in evaluation_metrics:
    evaluation_metric_value = evaluation_metric_func(y_test, y_pred)
    print(f"{evaluation_metric_name}:\n{evaluation_metric_value}")
    

In [None]:
# Re-print accuracy and the confusion matrix for the current `y_pred`, then show
# the macro-averaged F1 as the cell's output value.
evaluation_metrics = [
    ("Accuracy", accuracy_score),
    ("Confusion Matrix", confusion_matrix),
]

for evaluation_metric_name, evaluation_metric_func in evaluation_metrics:
    print(f"{evaluation_metric_name}:", evaluation_metric_func(y_test, y_pred), sep="\n")
f1_score(y_test, y_pred, average="macro")

### CNN

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.layers import Conv1D, MaxPooling1D

# The split produced earlier holds one flat feature vector per document, so the
# network convolves along the feature axis (Conv1D) with a single channel.
X_train_cnn = np.asarray(X_train)[..., np.newaxis]
X_test_cnn = np.asarray(X_test)[..., np.newaxis]

# The sparse categorical loss expects integer class ids 0..k-1, so map the raw
# labels onto their rank among all labels seen in either split.
cnn_classes = np.unique(np.concatenate([np.asarray(y_train), np.asarray(y_test)]))
y_train_cnn = np.searchsorted(cnn_classes, np.asarray(y_train))
y_test_cnn = np.searchsorted(cnn_classes, np.asarray(y_test))
num_classes = len(cnn_classes)

model = Sequential([
    Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(X_train_cnn.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(units=128, activation='relu'),
    # One output unit per class; softmax turns them into class probabilities.
    Dense(units=num_classes, activation='softmax')
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train_cnn, y_train_cnn, epochs=10, batch_size=32)
loss, accuracy = model.evaluate(X_test_cnn, y_test_cnn)


### RNN

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import SimpleRNN, Dense

X_train_array = X_train.to_numpy()
X_test_array = X_test.to_numpy()
y_train_array = y_train.to_numpy()
y_test_array = y_test.to_numpy()

# The sparse categorical loss expects integer class ids 0..k-1, so map the raw
# labels onto their rank among all labels seen in either split.
rnn_classes = np.unique(np.concatenate([y_train_array, y_test_array]))
y_train_encoded = np.searchsorted(rnn_classes, y_train_array)
y_test_encoded = np.searchsorted(rnn_classes, y_test_array)

num_samples, num_features = X_train_array.shape
timesteps = 1  # You may need to adjust this depending on your data

# Keras recurrent layers take (samples, timesteps, features) input.
X_train_reshaped = X_train_array.reshape(num_samples, timesteps, num_features)
X_test_reshaped = X_test_array.reshape(X_test_array.shape[0], timesteps, X_test_array.shape[1])


# create model here
model = tf.keras.Sequential([
    SimpleRNN(units=32, input_shape=(timesteps, num_features)),
    # One probability per class, so `accuracy` compares the arg-max class with the label.
    Dense(units=len(rnn_classes), activation='softmax')
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train_reshaped, y_train_encoded, epochs=100, batch_size=32)
loss, accuracy = model.evaluate(X_test_reshaped, y_test_encoded)

### LSTM

In [None]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from sklearn.metrics import f1_score
import numpy as np

# `df` is the headerless frame loaded at the top of the notebook:
# column 1 is used as the text input and column 0 as the label.

X = df[1].values
y = df[0].values

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)

# Padding
max_sequence_length = 100 # max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

# Convert to Numeric
X = padded_sequences
# The sparse categorical loss expects integer class ids 0..k-1; `y_encoded[i]` is
# the position of `y[i]` in the sorted unique labels `label_classes`.
label_classes, y_encoded = np.unique(y, return_inverse=True)
num_classes = len(label_classes)

# Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Build LSTM Model
vocab_size = len(tokenizer.word_index) + 1  # +1: the padding value 0 is not in word_index
embedding_dim = 100  # Adjust as needed
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length))
model.add(LSTM(units=128, implementation=2))  # Disable CuDNN
# One output unit per class; softmax turns them into class probabilities.
model.add(Dense(units=num_classes, activation='softmax'))

# Compile Model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Define a data generator
def data_generator(X, y, batch_size):
    """Yield shuffled ``(X_batch, y_batch)`` pairs forever.

    Each pass draws a fresh random permutation of the row positions of ``X``
    and walks through it in consecutive chunks of ``batch_size``; the last
    chunk of a pass is shorter when the row count is not a multiple of
    ``batch_size``. ``X`` and ``y`` are indexed with an array of positions.
    """
    total = X.shape[0]
    while True:
        order = np.random.permutation(np.arange(total))
        for start in range(0, total, batch_size):
            stop = start + batch_size
            picked = order[start:stop]
            yield X[picked], y[picked]

# Train Model with Data Generator
batch_size = 32
steps_per_epoch = len(X_train) // batch_size
train_generator = data_generator(X_train, y_train, batch_size)

# Train Model with Generator
model.fit(train_generator, epochs=10, steps_per_epoch=steps_per_epoch, validation_data=(X_test, y_test))

# Evaluate Model
# `predict` returns probabilities, which `f1_score` rejects; convert them to
# class labels first.
y_prob = model.predict(X_test)
if y_prob.ndim > 1 and y_prob.shape[-1] > 1:
    # Multi-unit output: the predicted class is the index of the largest
    # probability, so `y_test` must use the same 0-based class ids.
    y_pred = y_prob.argmax(axis=-1)
else:
    # Single sigmoid unit: threshold the probability at 0.5.
    y_pred = (y_prob.ravel() >= 0.5).astype(int)
f1 = f1_score(y_test, y_pred, average='macro')
print("F1 score (macro):", f1)



## Evaluation (known issue: unable to see other feature names)

In [None]:
import shap

# NOTE(review): `model`, `X_train` and `X_test` are whatever the most recently
# executed training cell left in the kernel -- confirm which model this explains.
shap_rows = X_test[50:100]  # the 50 held-out rows at positions 50..99

explainer = shap.Explainer(model, X_train)
shap_values = explainer.shap_values(shap_rows)
shap.summary_plot(shap_values, shap_rows, max_display=6)