In [1]:
%pip install scikit-learn keras tensorflow pandas matplotlib
%pip install numpy pandas nltk datasketch

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## Load and Preprocess the Datasets

In [1]:
import pandas as pd

# Load the train and test datasets
# Both files are comma-separated and are addressed by relative path, so they are
# looked up in the kernel's current working directory.
# NOTE(review): the test file name suggests it has no 'Label' column - confirm.
train_data = pd.read_csv('train.csv', sep=',')
test_data = pd.read_csv('test_without_labels.csv', sep=',')

# Plain-text preview of the first rows of the training data
print(train_data.head())

       Id                                              Title  \
0  227464  Netflix is coming to cable boxes, and Amazon i...   
1  244074  Pharrell, Iranian President React to Tehran 'H...   
2   60707                    Wildlife service seeks comments   
3   27883  Facebook teams up with Storyful to launch 'FB ...   
4  169596           Caesars plans US$880 mln New York casino   

                                             Content          Label  
0   if you subscribe to one of three rinky-dink (...  Entertainment  
1   pharrell, iranian president react to tehran '...  Entertainment  
2   the u.s. fish and wildlife service has reopen...     Technology  
3   the very nature of social media means it is o...     Technology  
4   caesars plans us$880 mln new york casino jul ...       Business  


In [None]:
import nltk
from nltk.corpus import stopwords

# Load stopwords from nltk and convert to set for fast lookup.
# The corpus is only downloaded when it is not already installed locally, so a
# re-run of the notebook does not need network access once the data is present.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

In [3]:
import re

# Preprocessing function - Remove punctuation, numbers and stopwords

# One or more characters that are not a word character, or that are a digit or an
# underscore. `\w` on its own would let '_' through, so it is listed explicitly.
# Whitespace is part of `\W`, which is harmless because tokens are split on it anyway.
_TOKEN_SEPARATOR_RE = re.compile(r'[\W\d_]+')


def preprocess_text(text, stop_word_set=None):
    """Turn a raw document into a space-separated string of lowercase tokens.

    The text is lowercased, every run of punctuation, whitespace, digits and
    underscores is treated as a token boundary, and tokens found in the
    stopword collection are dropped.

    Parameters
    ----------
    text : str or missing value
        Raw document. Anything `pd.isna` reports as missing (NaN, None, pd.NA)
        yields an empty string.
    stop_word_set : collection of str, optional
        Tokens to discard. Defaults to the notebook-level `stop_words` set.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ('' if nothing is left).
    """
    if pd.isna(text):   # Handle NaN cases
        return ''
    if stop_word_set is None:
        # Resolved at call time so the function picks up the notebook's stopword set.
        stop_word_set = stop_words
    tokens = _TOKEN_SEPARATOR_RE.sub(' ', text.lower()).split()
    return ' '.join(token for token in tokens if token not in stop_word_set)

In [4]:
# Combine 'Title' and 'Content' to one column (give Title 5 times more weight) - apply preprocessing
TITLE_WEIGHT = 5  # how many times the title is repeated in front of the content


def combine_title_and_content(data, title_weight=TITLE_WEIGHT):
    """Build one preprocessed text per row from the 'Title' and 'Content' columns.

    The title (plus a separating space) is repeated `title_weight` times and
    prepended to the content, so title words are counted that many times by a
    bag-of-words vectorizer. Missing titles or contents are replaced by an
    empty string first; otherwise a single NaN would turn the whole
    concatenation into NaN and the row's remaining text would be lost.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with 'Title' and 'Content' columns.
    title_weight : int, optional
        Number of title repetitions (default: TITLE_WEIGHT).

    Returns
    -------
    pandas.Series
        One preprocessed string per row, indexed like `data`.
    """
    titles = data['Title'].fillna('')
    contents = data['Content'].fillna('')
    return (title_weight * (titles + ' ') + contents).apply(preprocess_text)


X_train = combine_title_and_content(train_data)
y_train = train_data['Label'].to_numpy()

X_test = combine_title_and_content(test_data)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Numerical representation (bag of words)
# CountVectorizer is used with its default settings. The vocabulary is learned from
# the training texts only (fit_transform); the test texts are then encoded with that
# same fitted vectorizer (transform), so no test-set words enter the vocabulary.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

## 5-Fold Cross-Validation

In [6]:
from sklearn.model_selection import cross_val_score

In [7]:
from sklearn.svm import SVC

# 5-Fold Cross-Validation SVM (SVC with its default hyper-parameters)
svm = SVC()
svm_scores = cross_val_score(svm, X_train_vectorized, y_train, cv=5)
# The scores come from the full vectorized training set, not a sample of it,
# so the label no longer claims the evaluation ran on a subset.
print('SVM Accuracy (5-Fold CV):', svm_scores.mean())

SVM Accuracy (5-Fold CV on Subset): 0.9430654578162029


In [8]:
from sklearn.ensemble import RandomForestClassifier

# 5-Fold Cross-Validation Random Forest
# A fixed random_state makes the forest (and therefore the CV scores and any later
# predictions from this estimator) reproducible across kernel restarts.
rf = RandomForestClassifier(random_state=42)
rf_scores = cross_val_score(rf, X_train_vectorized, y_train, cv=5)
# The scores come from the full vectorized training set, not a sample of it,
# so the label no longer claims the evaluation ran on a subset.
print('Random Forest Accuracy (5-Fold CV):', rf_scores.mean())

Random Forest Accuracy (5-Fold CV on Subset): 0.9212487141643187


## Train and Predict

In [9]:
# Fit the SVM on the complete vectorized training set
svm.fit(X_train_vectorized, y_train)

# Label every test document with the fitted model
predictions = svm.predict(X_test_vectorized)

# Save an (Id, Predicted) table as the Kaggle submission file
svm_submission_path = 'testSet_categories_svm_prep.csv'
output_df = test_data[['Id']].assign(Predicted=predictions)
output_df.to_csv(svm_submission_path, index=False)
print(f"Output file '{svm_submission_path}' created successfully.")

Output file 'testSet_categories_svm_prep.csv' created successfully.


In [10]:
# Fit the Random Forest on the complete vectorized training set
rf.fit(X_train_vectorized, y_train)

# Label every test document with the fitted model
predictions = rf.predict(X_test_vectorized)

# Save an (Id, Predicted) table as the Kaggle submission file
rf_submission_path = 'testSet_categories_rf_prep.csv'
output_df = test_data[['Id']].assign(Predicted=predictions)
output_df.to_csv(rf_submission_path, index=False)
print(f"Output file '{rf_submission_path}' created successfully.")

Output file 'testSet_categories_rf_prep.csv' created successfully.
