In [None]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Train a TF-IDF + logistic-regression text classifier on 'train.csv' and
# persist both the fitted model and the fitted vectorizer with pickle.
#
# Expects 'train.csv' (replace with the correct path) to contain a 'text'
# column and a 'label' column.
news_dataset = pd.read_csv('train.csv')

# Missing texts become empty strings so the vectorizer receives only str values.
news_dataset['text'] = news_dataset['text'].fillna('')

# Raw features (X) and labels (Y).
X = news_dataset['text']
Y = news_dataset['label']

# Split the RAW text before vectorizing. Fitting the vectorizer on the full
# dataset would let the vocabulary and IDF weights be learned from the
# held-out rows, leaking test information into the features.
# Same test_size / stratify / random_state as before, so the row partition
# itself is unchanged.
text_train, text_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=2
)

# Learn the vocabulary and IDF weights from the training portion only, then
# apply that fitted transformation to the held-out portion.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Train the classifier on the training features. X_test / Y_test stay
# available for evaluation (e.g. model.score(X_test, Y_test)).
model = LogisticRegression()
model.fit(X_train, Y_train)

# Persist the model and the vectorizer; both are needed at inference time
# because new text must be transformed with the same fitted vocabulary.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open('vectorizer.pkl', 'wb') as vec_file:
    pickle.dump(vectorizer, vec_file)

print("Model and vectorizer saved as 'model.pkl' and 'vectorizer.pkl'.")


Model and vectorizer saved as 'model.pkl' and 'vectorizer.pkl'.
