In [35]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, GlobalMaxPooling1D, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# NOTE: The dataset was created specifically for this assessment, so it is small and
# imbalanced. The focus is therefore on the approach and methods, which apply the
# usual model-enhancement techniques, rather than on the absolute scores.

In [36]:
# Load the labelled news-title dataset. Later cells read its 'News Title',
# 'Category' and 'Sentiment' columns.
df = pd.read_csv('Input_Data.csv')

In [37]:
# Preview the first five rows (the default row count for head()).
df.head()

Unnamed: 0,News Title,Category,Sentiment,Aspect
0,Most economical spells at T20 World Cup - Whic...,Sports,Neutral,Economy
1,"Four overs, three wickets, no run: Lockie Ferg...",Sports,Positive,Achievement
2,Economical Alternative to Traditional Cleanroo...,Economy,Positive,Economics
3,T20 WC: Most economical four-over spells by In...,Sports,Neutral,Cricket
4,This 2002 Cirrus SR22 G1 Is a Surprisingly Eco...,Economical,Positive,Economy


In [38]:
# Preprocessing resources
# The stop-word list, the Punkt tokenizer models used by word_tokenize and the
# WordNet data used by the lemmatizer are separate NLTK downloads. Fetch any that
# are missing so the notebook also runs on a fresh environment (Restart & Run All).
_NLTK_RESOURCES = {
    'stopwords': 'corpora/stopwords',
    'punkt': 'tokenizers/punkt',
    'punkt_tab': 'tokenizers/punkt_tab',  # looked up by newer NLTK releases
    'wordnet': 'corpora/wordnet',
    'omw-1.4': 'corpora/omw-1.4',  # needed by some NLTK releases to load WordNet
}
for _package, _resource_path in _NLTK_RESOURCES.items():
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        # download() reports failures (e.g. no network) without raising, so an
        # offline machine that already has the data is not affected.
        nltk.download(_package, quiet=True)

# A set gives O(1) membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


In [39]:
def preprocess_text(text):
    """Normalise a news title into a space-separated string of lemmatised tokens.

    Steps: replace punctuation with spaces, lower-case, tokenise with NLTK's
    ``word_tokenize``, drop English stop words and lemmatise what remains.

    Args:
        text: Raw title. Non-string values (e.g. NaN from an empty CSV cell)
            are treated as empty text.

    Returns:
        str: The cleaned tokens joined by single spaces; '' when nothing is left.
    """
    if not isinstance(text, str):
        # re.sub raises TypeError on NaN/None; treat a missing title as empty.
        return ''
    # Substitute a space (not '') so hyphen- or slash-joined words stay separate
    # tokens, e.g. "four-over" -> "four over" instead of "fourover".
    text = re.sub(r'[^\w\s]', ' ', text)
    text = text.lower()
    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Clean every title element-wise and keep the result in a new column.
df['processed_title'] = df['News Title'].map(preprocess_text)

In [40]:
# Encoding target variables
# Each target gets its own encoder so predictions can later be mapped back to
# the original label strings with inverse_transform.
# NOTE(review): the data preview lists both 'Economy' and 'Economical' as
# categories; confirm these are meant to be distinct classes.
label_encoder_category = LabelEncoder()
df['category_encoded'] = label_encoder_category.fit_transform(df['Category'])

label_encoder_sentiment = LabelEncoder()
df['sentiment_encoded'] = label_encoder_sentiment.fit_transform(df['Sentiment'])

In [41]:
# Splitting data
# Model input (cleaned titles) and the two integer-encoded targets.
X, y_category, y_sentiment = (
    df['processed_title'],
    df['category_encoded'],
    df['sentiment_encoded'],
)

In [42]:
# Hold out 20% of the rows for each task. Both calls receive the same X and the
# same random_state, so they are expected to select the same rows.
X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(
    X, y_category, random_state=42, test_size=0.2
)
X_train_sent, X_test_sent, y_train_sent, y_test_sent = train_test_split(
    X, y_sentiment, random_state=42, test_size=0.2
)

In [43]:
# Tokenization and padding sequences
# The vocabulary is learned from the training titles only; the word cap is
# 10,000 and '<OOV>' is the placeholder token for out-of-vocabulary words.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=10000)
tokenizer.fit_on_texts(X_train_cat)

In [44]:
# Convert every split to lists of integer word ids with the shared tokenizer.
X_train_cat_seq, X_test_cat_seq, X_train_sent_seq, X_test_sent_seq = (
    tokenizer.texts_to_sequences(texts)
    for texts in (X_train_cat, X_test_cat, X_train_sent, X_test_sent)
)

In [45]:
# Fixed input length for both models; the 'post' settings pad and truncate at
# the end of each sequence.
max_length = 50
X_train_cat_padded, X_test_cat_padded, X_train_sent_padded, X_test_sent_padded = (
    pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
    for sequences in (X_train_cat_seq, X_test_cat_seq, X_train_sent_seq, X_test_sent_seq)
)

In [46]:
# Category Prediction Model
# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and batch
# shuffling are repeatable whenever this cell is re-run.
# (Bit-exact GPU runs may additionally need TensorFlow's op-determinism mode.)
tf.keras.utils.set_random_seed(42)

# One softmax unit per class known to the fitted label encoder, so the output
# layer always matches the integer labels the model is trained on.
num_categories = len(label_encoder_category.classes_)

model_category = Sequential([
    # input_dim must match the word cap given to the Tokenizer (num_words=10000).
    Embedding(input_dim=10000, output_dim=64, input_length=max_length),
    Bidirectional(LSTM(64, return_sequences=True)),
    # Collapse the per-token BiLSTM outputs into one vector per title.
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(num_categories, activation='softmax')
])

# Labels are integer class ids, hence the sparse variant of the loss.
model_category.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_category.summary()

# Train the Category Prediction Model
# Stop once val_loss has not improved for 3 epochs and roll back to the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# NOTE(review): validation_data is the held-out test split, so early stopping
# picks the epoch using test data and the reported val metrics are optimistic.
# Consider carving a separate validation split out of the training data.
history_category = model_category.fit(X_train_cat_padded, y_train_cat, epochs=20, batch_size=32,
                                     validation_data=(X_test_cat_padded, y_test_cat), callbacks=[early_stopping])

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 50, 64)            640000    
                                                                 
 bidirectional_7 (Bidirecti  (None, 50, 128)           66048     
 onal)                                                           
                                                                 
 global_max_pooling1d_7 (Gl  (None, 128)               0         
 obalMaxPooling1D)                                               
                                                                 
 dense_11 (Dense)            (None, 64)                8256      
                                                                 
 dropout_4 (Dropout)         (None, 64)                0         
                                                                 
 dense_12 (Dense)            (None, 5)                

In [47]:
# Sentiment Prediction Model
# Seed Python, NumPy and TensorFlow so this cell gives repeatable results on its
# own, regardless of what was trained before it.
tf.keras.utils.set_random_seed(42)

# One softmax unit per class known to the fitted label encoder.
num_sentiments = len(label_encoder_sentiment.classes_)

model_sentiment = Sequential([
    # input_dim must match the word cap given to the Tokenizer (num_words=10000).
    Embedding(input_dim=10000, output_dim=64, input_length=max_length),
    Bidirectional(LSTM(64, return_sequences=True)),
    # Collapse the per-token BiLSTM outputs into one vector per title.
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(num_sentiments, activation='softmax')
])

# Labels are integer class ids, hence the sparse variant of the loss.
model_sentiment.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_sentiment.summary()

# Train the Sentiment Prediction Model
# Use a dedicated callback instead of an object created (and already used) in
# another cell, so this cell does not depend on hidden state.
early_stopping_sentiment = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# NOTE(review): validation_data is the held-out test split, so early stopping
# picks the epoch using test data and the reported val metrics are optimistic.
history_sentiment = model_sentiment.fit(X_train_sent_padded, y_train_sent, epochs=20, batch_size=32,
                                       validation_data=(X_test_sent_padded, y_test_sent),
                                       callbacks=[early_stopping_sentiment])


Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 50, 64)            640000    
                                                                 
 bidirectional_8 (Bidirecti  (None, 50, 128)           66048     
 onal)                                                           
                                                                 
 global_max_pooling1d_8 (Gl  (None, 128)               0         
 obalMaxPooling1D)                                               
                                                                 
 dense_13 (Dense)            (None, 64)                8256      
                                                                 
 dropout_5 (Dropout)         (None, 64)                0         
                                                                 
 dense_14 (Dense)            (None, 3)                

In [49]:
# Function to predict category and sentiment for a new title
def predict_category_sentiment(title):
    """Predict the category and sentiment labels for one raw news title.

    The title goes through the same steps as the training data (preprocess_text,
    the fitted tokenizer, post-padding to max_length) and is then scored by both
    trained models.

    Args:
        title: Raw headline text.

    Returns:
        tuple: (category, sentiment) as the original label values recovered
        from the two label encoders.
    """
    cleaned = preprocess_text(title)
    token_ids = tokenizer.texts_to_sequences([cleaned])
    model_input = pad_sequences(token_ids, maxlen=max_length, padding='post', truncating='post')

    # Each model returns one row of class probabilities for the single title.
    category_probs = model_category.predict(model_input)
    sentiment_probs = model_sentiment.predict(model_input)

    # Highest-probability class id for the only row in the batch.
    category_idx = category_probs.argmax(axis=-1)[0]
    sentiment_idx = sentiment_probs.argmax(axis=-1)[0]

    return (
        label_encoder_category.inverse_transform([category_idx])[0],
        label_encoder_sentiment.inverse_transform([sentiment_idx])[0],
    )

# Example prediction
# Run the full pipeline on one headline and print both predicted labels.
# NOTE(review): in the saved run this cricket headline was labelled 'Politics',
# which looks like a misclassification, presumably due to the small, imbalanced
# dataset. Sanity-check the prediction after retraining.
new_title = "Lockie Ferguson produces most economical spell in T20 World Cup history"
predicted_category, predicted_sentiment = predict_category_sentiment(new_title)
print(f"Predicted Category: {predicted_category}, Predicted Sentiment: {predicted_sentiment}")

Predicted Category: Politics, Predicted Sentiment: Positive
