In [None]:
import os
import re
from pathlib import Path

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from textblob import TextBlob

In [None]:
# Fetch the NLTK resources named 'stopwords' and 'wordnet'.
# NOTE(review): presumably these back `stopwords.words('english')` and
# `WordNetLemmatizer` used later in the notebook — confirm no further NLTK
# resource is needed for the installed NLTK version.
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# Shared lemmatizer instance, reused for every token in preprocess_text.
lemmatizer = WordNetLemmatizer()

# English stop words held in a set so the per-token membership test is cheap.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

In [None]:
# Directory that holds the competition CSV files. Set the
# NLP_GETTING_STARTED_DIR environment variable to run the notebook on another
# machine; when it is unset the original local path is used, so existing
# setups keep working unchanged.
DATA_DIR = Path(os.environ.get("NLP_GETTING_STARTED_DIR", "E:/Kaggle/nlp-getting-started"))

# Load the labelled training data and the unlabelled test data.
train_df = pd.read_csv(DATA_DIR / "train.csv")
test_df = pd.read_csv(DATA_DIR / "test.csv")

In [None]:
# Display the first rows of the training frame as a quick sanity check of the load.
train_df.head()

In [None]:
def clean_text_with_textblob(text):
    """Strip URLs and punctuation, lowercase, then spell-correct with TextBlob.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. ``None`` or the float NaN that
        pandas uses for missing cells) is treated as missing text.

    Returns
    -------
    str
        The cleaned, lowercased and spell-corrected text. An empty string is
        returned for missing input; text that is empty or whitespace-only
        after cleaning is returned as-is without spell correction.
    """
    # Missing values reach this function as non-strings when it is applied
    # over a DataFrame column; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ""

    # Drop URLs. Matching is case-insensitive because the text is lowercased
    # below anyway; otherwise "HTTP://..." would survive as alphanumeric junk.
    text = re.sub(r"http\S+", "", text, flags=re.IGNORECASE)
    # Keep only ASCII letters, digits and whitespace, then lowercase.
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text).lower()

    # Nothing left to correct: skip the (slow) spell-correction pass.
    if not text.strip():
        return text

    return str(TextBlob(text).correct())

def preprocess_text(text):
    """Clean a text, drop stop words, and lemmatize the remaining tokens.

    The text is first passed through ``clean_text_with_textblob`` and split
    on whitespace. Tokens found in the module-level ``stop_words`` set are
    discarded; every other token is lemmatized with the module-level
    ``lemmatizer``. The surviving lemmas are returned joined by single spaces.
    """
    cleaned = clean_text_with_textblob(text)

    kept_lemmas = []
    for token in cleaned.split():
        # The stop-word check is made on the raw token, before lemmatizing.
        if token in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token))

    return " ".join(kept_lemmas)

In [None]:
# Run the same preprocessing over the 'text' column of both frames and keep
# the result in a new 'cleaned_text' column (train first, then test).
for frame in (train_df, test_df):
    frame['cleaned_text'] = frame['text'].apply(preprocess_text)

In [None]:
# Features are the preprocessed texts; labels come from the 'target' column.
X, y = train_df['cleaned_text'], train_df['target']

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# TF-IDF over unigrams, bigrams and trigrams. The vectorizer is fitted on the
# training split only, so the validation and test texts do not influence it.
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
X_train_tfidf = vectorizer.fit_transform(X_train)

# Project the held-out and test texts with the already-fitted vectorizer.
X_val_tfidf, X_test_tfidf = (
    vectorizer.transform(texts) for texts in (X_val, test_df['cleaned_text'])
)

In [None]:
# Logistic-regression classifier on the TF-IDF features; the seed is fixed
# and the iteration cap is set to 200.
model = LogisticRegression(
    max_iter=200,
    random_state=42,
)
model.fit(X_train_tfidf, y_train)

In [None]:
# Score the fitted model on the held-out validation split.
y_val_pred = model.predict(X_val_tfidf)

val_accuracy = accuracy_score(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)

print("Validation Accuracy:", val_accuracy)
print("\nClassification Report:\n", val_report)

In [None]:
# Predict a label for every test row from its TF-IDF features and store the
# result on the test frame in a new 'predictions' column.
test_df['predictions'] = model.predict(X_test_tfidf)

In [None]:
# Build the submission table: keep the row id and the predicted label, with
# the label column renamed from 'predictions' to 'target'.
submission = (
    test_df.loc[:, ['id', 'predictions']]
    .rename(columns={'predictions': 'target'})
)

# Write the table to disk without the DataFrame index.
submission.to_csv('submission.csv', index=False)