In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Download necessary resources
nltk.download('punkt')
nltk.download('stopwords')

# Load the cleaned dataset
data = pd.read_csv("train_set.csv")

# Print dataset columns to check available labels
print("Dataset Columns:", data.columns)

# Assuming the correct label column is 'type' based on typical datasets
label_column = 'type' if 'type' in data.columns else 'label'

# Drop rows with NaN values in label column
data = data.dropna(subset=[label_column])

# Ensure text data is preprocessed
data['processed_text'] = data['content'].fillna('')

# Feature Extraction (Bag of Words & TF-IDF)
vectorizer = CountVectorizer(max_features=10000)
X_train_vec = vectorizer.fit_transform(data['processed_text'])

tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf_vectorizer.fit_transform(data['processed_text'])

# Logistic Regression Model
log_reg = LogisticRegression()
log_reg.fit(X_train_vec, data[label_column])
y_pred = log_reg.predict(X_train_vec)
print("Logistic Regression Performance:")
print(classification_report(data[label_column], y_pred))

# Load validation and test datasets
val_data = pd.read_csv("val_set.csv", dtype=str)
test_data = pd.read_csv("test_set.csv", dtype=str)

# Apply the same processing to validation and test datasets
for dataset_name in ["val_set.csv", "test_set.csv"]:
    dataset = pd.read_csv(dataset_name, dtype=str)
    dataset = dataset.dropna(subset=[label_column])
    dataset[label_column] = dataset[label_column].astype(str)
    dataset['processed_text'] = dataset['content'].fillna('')
    
    if dataset_name == "val_set.csv":
        val_data = dataset
        X_val_vec = vectorizer.transform(val_data['processed_text'])
    else:
        test_data = dataset
        X_test_vec = vectorizer.transform(test_data['processed_text'])

# Logistic Regression Model with increased max_iter
log_reg = LogisticRegression(max_iter=500)
log_reg.fit(X_train_vec, data[label_column])
y_pred = log_reg.predict(X_train_vec)
print("Logistic Regression Performance:")
print(classification_report(data[label_column], y_pred, zero_division=0))

# Evaluate Logistic Regression Model on validation and test sets
y_val_pred = log_reg.predict(X_val_vec)
y_test_pred = log_reg.predict(X_test_vec)

print("Validation Set Performance:")
print(classification_report(val_data[label_column], y_val_pred, zero_division=0))

print("Test Set Performance:")
print(classification_report(test_data[label_column], y_test_pred, zero_division=0))



[nltk_data] Downloading package punkt to /Users/theo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/theo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  data = pd.read_csv("train_set.csv")


Dataset Columns: Index(['Unnamed: 0', 'id', 'domain', 'type', 'url', 'content', 'scraped_at',
       'inserted_at', 'updated_at', 'title', 'authors', 'keywords',
       'meta_keywords', 'meta_description', 'tags', 'summary', 'source'],
      dtype='object')


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Performance:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                            precision    recall  f1-score   support

2018-02-10 13:43:39.521661       0.00      0.00      0.00         1
                      bias       0.74      0.68      0.71    106564
                 clickbait       0.71      0.37      0.49     21897
                conspiracy       0.71      0.65      0.68     77782
                      fake       0.77      0.73      0.75     83963
                      hate       0.79      0.59      0.67      7026
                   junksci       0.73      0.51      0.60     11280
                 political       0.63      0.79      0.70    155644
                  reliable       0.83      0.91      0.87    174833
                     rumor       0.87      0.88      0.87     45138
                    satire       0.63      0.32      0.43     10475
                   unknown       0.70      0.45      0.54     34780
                unreliable       0.94      0.78      0.85     28285

                  accuracy                    

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Performance:
                            precision    recall  f1-score   support

2018-02-10 13:43:39.521661       0.00      0.00      0.00         1
                      bias       0.77      0.70      0.73    106564
                 clickbait       0.72      0.45      0.55     21897
                conspiracy       0.73      0.67      0.70     77782
                      fake       0.78      0.75      0.77     83963
                      hate       0.87      0.70      0.78      7026
                   junksci       0.79      0.65      0.71     11280
                 political       0.65      0.80      0.72    155644
                  reliable       0.86      0.92      0.88    174833
                     rumor       0.89      0.91      0.90     45138
                    satire       0.71      0.49      0.58     10475
                   unknown       0.72      0.51      0.59     34780
                unreliable       0.95      0.82      0.88     28285

             

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import swifter  # Faster parallel apply
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Load stopwords once to improve performance
stop_words = set(stopwords.words('english'))

# Text Preprocessing Function
def preprocess_text(text):
    """
    Cleans and preprocesses text data to improve model performance.
    - Converts to lowercase
    - Removes non-word characters (punctuation, symbols)
    - Tokenizes text into words
    - Removes common stopwords
    """
    text = text.lower()
    text = re.sub(r'\W', ' ', text)  # Remove non-word characters
    words = word_tokenize(text)  # Tokenize into words
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    return ' '.join(words)

# Load and process training dataset
data = pd.read_csv("train_set.csv", dtype=str)
label_column = 'type' if 'type' in data.columns else 'label'  # Identify the label column
data = data.dropna(subset=[label_column])  # Remove rows with missing labels
data[label_column] = data[label_column].astype(str)  # Ensure labels are strings
data['processed_text'] = data['content'].fillna('').swifter.apply(preprocess_text)  # Apply text preprocessing

# Load and process validation dataset
val_data = pd.read_csv("val_set.csv", dtype=str)
val_data = val_data.dropna(subset=[label_column])
val_data[label_column] = val_data[label_column].astype(str)
val_data['processed_text'] = val_data['content'].fillna('').swifter.apply(preprocess_text)

# Load and process test dataset
test_data = pd.read_csv("test_set.csv", dtype=str)
test_data = test_data.dropna(subset=[label_column])
test_data[label_column] = test_data[label_column].astype(str)
test_data['processed_text'] = test_data['content'].fillna('').swifter.apply(preprocess_text)

# Feature Extraction using TF-IDF (Increasing max features and adding n-grams)
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_train_tfidf = tfidf_vectorizer.fit_transform(data['processed_text'])
X_val_tfidf = tfidf_vectorizer.transform(val_data['processed_text'])
X_test_tfidf = tfidf_vectorizer.transform(test_data['processed_text'])

# Define Logistic Regression Model
log_reg = LogisticRegression(class_weight='balanced')

# Define Hyperparameter Grid for Optimization
param_grid = {
    'C': [0.01, 0.1, 1, 10],  # Regularization strength
    'solver': ['liblinear', 'saga'],
    'max_iter': [300, 500, 1000]  # Number of iterations
}

# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(log_reg, param_grid, scoring='f1_weighted', cv=3, n_jobs=-1)
grid_search.fit(X_train_tfidf, data[label_column])

# Best Model after Hyperparameter Tuning
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Generate Predictions
y_val_pred = best_model.predict(X_val_tfidf)
y_test_pred = best_model.predict(X_test_tfidf)

# Evaluate Model Performance
print("Validation Set Performance:")
print(classification_report(val_data[label_column], y_val_pred, zero_division=0))

print("Test Set Performance:")
print(classification_report(test_data[label_column], y_test_pred, zero_division=0))