In [None]:
#importing important libraries
import pandas as pd
import re
from html import unescape
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import joblib
import nltk
from multiprocessing import Pool


In [None]:
# Download the NLTK resources used by the preprocessing cells:
#   stopwords         -> stopwords.words('english')
#   punkt / punkt_tab -> word_tokenize (newer NLTK releases look up 'punkt_tab'
#                        rather than the older pickled 'punkt' models)
#   wordnet           -> WordNetLemmatizer
# nltk.download skips resources that are already up to date, so re-running is cheap.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)


In [None]:
# Read the raw question and tag tables from CSV files in the working directory.
# Both files are decoded as latin1; questions are read first, then tags.
questions_df, tags_df = (
    pd.read_csv(csv_name, encoding='latin1')
    for csv_name in ('Questions.csv', 'Tags.csv')
)


In [None]:
# Inner-join questions with tags on the shared 'Id' column, so only Ids present
# in both tables survive.
# NOTE(review): if Tags.csv holds one row per (Id, tag) pair, this join repeats
# each question once per tag — confirm against the file's schema.
questions_tags_df = questions_df.merge(tags_df, on='Id', how='inner')


In [None]:
# Combine title and body into one text field.
# Missing titles/bodies are treated as empty strings: with plain `+`, a NaN in
# either column would make the whole 'Text' value NaN, and the string-cleaning
# steps that follow (re.sub, unescape, .lower()) fail on non-string values.
questions_tags_df['Text'] = (
    questions_tags_df['Title'].fillna('') + " " + questions_tags_df['Body'].fillna('')
)

In [None]:
# Function to strip HTML tags (HTML entities are NOT decoded here)
def clean_html(text):
    """Remove HTML tags from ``text``.

    Everything from a ``<`` up to the next ``>`` is deleted, including tags
    whose attributes span several lines. Entities such as ``&amp;`` are left
    untouched.

    Parameters
    ----------
    text : str
        Raw text that may contain HTML markup.

    Returns
    -------
    str
        ``text`` without tags. Non-string input (e.g. ``None`` or a NaN coming
        from a missing cell) yields an empty string instead of raising.
    """
    if not isinstance(text, str):
        return ''
    # [^>]* also matches newlines, unlike '.', so multi-line tags are removed too.
    return re.sub(r'<[^>]*>', '', text)


In [None]:
# Clean every document element-wise: strip HTML tags first, then decode HTML
# entities (e.g. &amp; -> &) in what remains.
questions_tags_df['Text'] = questions_tags_df['Text'].map(
    lambda raw_text: unescape(clean_html(raw_text))
)


In [None]:
# Function to process text: lowercase, tokenize, drop stopwords and
# non-alphabetic tokens, lemmatize.
# The stopword set and the lemmatizer are created once and reused; building them
# inside process_text would redo that work for every single document.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def process_text(text):
    """Normalise one document into a space-separated string of lemmas.

    Parameters
    ----------
    text : str
        A single document (not a Series or list of documents).

    Returns
    -------
    str
        The lowercased tokens of ``text`` that are purely alphabetic and are not
        English stopwords, each lemmatized, joined by single spaces.
    """
    stop_words, lemmatizer = _get_text_resources()

    # Tokenize, remove stopwords and lemmatize
    tokens = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    ]
    return ' '.join(tokens)


In [None]:
# Parallelize the text processing
def parallelize_dataframe(df, func, num_cores=4):
    """Apply ``func`` to row-wise chunks of ``df`` in parallel and concatenate the results.

    Parameters
    ----------
    df : pandas.Series or pandas.DataFrame
        Data to split into consecutive row chunks.
    func : callable
        Called once per chunk with that chunk (a Series/DataFrame slice, NOT a
        single element); must return a pandas object that ``pd.concat`` accepts.
        When more than one chunk is processed it is sent to worker processes,
        so it must be picklable (a module-level function, not a lambda).
    num_cores : int, default 4
        Maximum number of chunks / worker processes.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        The per-chunk results concatenated in the original row order.

    Raises
    ------
    ValueError
        If ``num_cores`` is smaller than 1.
    """
    if num_cores < 1:
        raise ValueError("num_cores must be at least 1")

    n_rows = len(df)
    # Never create more chunks (and worker processes) than there are rows.
    n_chunks = max(1, min(num_cores, n_rows))

    # Split with positional slices so no numpy import is needed; the first
    # `remainder` chunks get one extra row, giving nearly equal chunk sizes.
    base_size, remainder = divmod(n_rows, n_chunks)
    chunks = []
    start = 0
    for chunk_no in range(n_chunks):
        stop = start + base_size + (1 if chunk_no < remainder else 0)
        chunks.append(df.iloc[start:stop])
        start = stop

    if n_chunks == 1:
        # A single chunk gains nothing from a worker process; run it inline.
        results = [func(chunks[0])]
    else:
        # The context manager releases the pool even if func raises in a worker.
        with Pool(n_chunks) as pool:
            results = pool.map(func, chunks)
    return pd.concat(results)


In [None]:
# Apply process_text in parallel.
# parallelize_dataframe passes a whole Series chunk to the function it is given,
# while process_text expects one string, so the chunk-level wrapper below applies
# process_text to every element. It is a named top-level function rather than a
# lambda so that multiprocessing can pickle it for the worker processes.
def process_text_chunk(text_chunk):
    """Run process_text on every document in one Series chunk."""
    return text_chunk.apply(process_text)


questions_tags_df['Cleaned_Text'] = parallelize_dataframe(questions_tags_df['Text'], process_text_chunk)


In [None]:
# Handle missing values in Tags (if any): drop every row whose 'Tags' value is missing.
# This mutates questions_tags_df in place, so the dropped rows can only be
# recovered by re-running the earlier cells.
# NOTE(review): assumes the merged frame has a column literally named 'Tags' —
# confirm against the header of Tags.csv.
questions_tags_df.dropna(subset=['Tags'], inplace=True)


In [None]:
# Turn each whitespace-separated tag string into a list of individual tag strings.
questions_tags_df['Tags'] = questions_tags_df['Tags'].map(str.split)


In [None]:
# Encode the per-row tag lists as a binary indicator matrix `y`
# (one column per distinct tag); `mlb` keeps the fitted tag <-> column mapping.
tag_lists = questions_tags_df['Tags']
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(tag_lists)


In [None]:
# Features: the cleaned text column; the labels `y` come from the binarizer cell.
X = questions_tags_df.loc[:, 'Cleaned_Text']


In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [None]:

# TF-IDF Vectorization over unigrams and bigrams, capped at 10,000 features.
# Documents are split on whitespace. str.split is used instead of a lambda
# because the vectorizer is saved with joblib.dump further down, and a lambda
# cannot be pickled. token_pattern=None tells scikit-learn the default regex is
# intentionally unused (it is ignored whenever a tokenizer is supplied), which
# avoids the corresponding warning.
tfidf_vectorizer = TfidfVectorizer(
    max_features=10000,
    tokenizer=str.split,
    token_pattern=None,
    ngram_range=(1, 2),
)


In [None]:
# Fit and transform on training data: the vectorizer is fitted on the training
# documents only, and their TF-IDF matrix is returned in the same pass.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)


In [None]:
# Transform validation data with the already-fitted vectorizer
# (transform only — nothing is re-fitted on the validation documents).
X_val_tfidf = tfidf_vectorizer.transform(X_val)


In [None]:
# Persist the processed table as CSV (without the index column), then pickle
# the fitted vectorizer and label binarizer with joblib.
questions_tags_df.to_csv('processed_data.csv', index=False)
for artifact, artifact_path in (
    (tfidf_vectorizer, 'tfidf_vectorizer.pkl'),
    (mlb, 'label_binarizer.pkl'),
):
    joblib.dump(artifact, artifact_path)


In [None]:
# Sample model training code
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, f1_score, hamming_loss


In [None]:
# Build a multi-output classifier around a logistic regression
# (max_iter raised to 1000) and fit it on the TF-IDF training matrix.
base_estimator = LogisticRegression(max_iter=1000)
clf = MultiOutputClassifier(base_estimator)
clf.fit(X_train_tfidf, y_train)


In [None]:
# Predict on the validation set and report each metric in turn.
y_pred = clf.predict(X_val_tfidf)
metric_reports = (
    ("Accuracy: ", lambda: accuracy_score(y_val, y_pred)),
    ("F1 Score: ", lambda: f1_score(y_val, y_pred, average='weighted')),
    ("Hamming Loss: ", lambda: hamming_loss(y_val, y_pred)),
)
# Each metric is computed right before it is printed, one after another.
for metric_label, compute_metric in metric_reports:
    print(metric_label, compute_metric())
