In [18]:
import re

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# Load datasets
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Training rows are unusable without both the text and its label, so drop them.
train_data = train_data.dropna(subset=['text', 'target'])

# Every test row needs a prediction in the output file, so test rows must not
# be dropped: a missing text is replaced by an empty string instead, which the
# downstream cleaning/vectorising steps handle like any other (empty) document.
test_data = test_data.fillna({'text': ''})

# Text preprocessing function
def clean_text(text):
    """Normalise a raw text string before it is vectorised.

    The steps run in this order: lowercase; remove URLs (tokens starting at
    ``http`` or at ``www.``); remove ``@mentions`` and ``#hashtags`` (the whole
    token, not only the symbol); remove remaining punctuation; remove digits;
    trim leading/trailing whitespace. Whitespace left in the middle of the
    string by removed tokens is not collapsed.

    Parameters
    ----------
    text : str
        The raw text. Must be a string; missing values have to be handled by
        the caller before this function is applied.

    Returns
    -------
    str
        The cleaned text, possibly empty.
    """
    text = text.lower()

    # URLs, mentions and hashtags are stripped before punctuation: once ':',
    # '/', '.', '@' and '#' are gone these tokens can no longer be recognised.
    text = re.sub(r'http\S+', '', text)   # Remove http:// and https:// URLs
    # Require the literal dot after "www" so ordinary words that merely contain
    # "www" (e.g. "awwww") are not mistaken for a URL and truncated.
    text = re.sub(r'www\.\S+', '', text)  # Remove URLs starting with www.
    text = re.sub(r'@\w+', '', text)      # Remove mentions
    text = re.sub(r'#\w+', '', text)      # Remove hashtags

    text = re.sub(r'[^\w\s]', '', text)   # Remove punctuation
    text = re.sub(r'\d+', '', text)       # Remove numbers
    return text.strip()

# Run the cleaner over the text column of both the training and test frames
for frame in (train_data, test_data):
    frame['text'] = frame['text'].apply(clean_text)

# Model input (cleaned text) and the label column
X, y = train_data['text'], train_data['target']


In [19]:
# Hold out 20% of the labelled rows for validation; the seed is fixed so the
# same split is produced on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [20]:
# TF-IDF encoder, capped at 5000 features
vectorizer = TfidfVectorizer(max_features=5000)

# The vectorizer is fitted on the training split only, which is encoded in the
# same call.
X_train_tfidf = vectorizer.fit_transform(X_train)

# Validation and test texts are encoded with the already-fitted vectorizer
X_val_tfidf, X_test_tfidf = (
    vectorizer.transform(documents)
    for documents in (X_val, test_data['text'])
)


In [21]:
# Build the logistic-regression classifier and fit it on the TF-IDF training
# matrix in one expression (fit() hands back the fitted estimator).
model = LogisticRegression(random_state=42).fit(X_train_tfidf, y_train)

# Persist both artefacts: the classifier and the fitted vectorizer it was
# trained with.
joblib.dump(model, 'logistic_regression_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

In [22]:
# Label the held-out validation rows
y_val_pred = model.predict(X_val_tfidf)

# Compute the metrics first, then report them
val_accuracy = accuracy_score(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)

print("Validation Accuracy:", val_accuracy)
print("\nClassification Report:\n", val_report)


Validation Accuracy: 0.8030203545633617

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.88      0.84       874
           1       0.82      0.69      0.75       649

    accuracy                           0.80      1523
   macro avg       0.81      0.79      0.79      1523
weighted avg       0.80      0.80      0.80      1523



In [23]:
# Score the test rows and attach the predicted labels to the frame
test_predictions = model.predict(X_test_tfidf)
test_data['predictions'] = test_predictions

# Write the id/prediction pairs to CSV, without the DataFrame index
test_data.loc[:, ['id', 'predictions']].to_csv('test_predictions.csv', index=False)
