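# Multi-Output Classifier with Logistic Regression

Trains a multi-label tag classifier for tweets: TF-IDF features fed into one logistic-regression model per tag.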

In [4]:
import pandas as pd
# Load the raw labelled data under its own name so the unfiltered frame stays
# available and re-running this cell always starts from the same input.
main_raw_df = pd.read_csv('data/main.csv')

# Keep only complete rows (no missing values) whose label is not 1.
main_df = main_raw_df.dropna()
main_df = main_df[main_df['label'] != 1]

# Persist the filtered frame without the index column.
save_path = 'data/main_no_1.csv'
main_df.to_csv(save_path, index=False)

# Preview: must be the cell's last expression, otherwise nothing is rendered.
main_df.head()

In [3]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Step 1: Merge the CSV Files
# Inputs (adjust file names and paths as necessary):
#   - tweets file: must provide the join keys 'tweet_topics' and 'tweet'
#   - labels file: must provide 'tweet_topics'; presumably also the 'tags'
#     column used in Step 2 -- TODO confirm which input supplies it
tweets_df = pd.read_csv('data/tweet_labelled_cleaned_topics.csv')
tags_df = pd.read_csv('data/labels.csv')

# Attach the topic labels (key: 'tweet_topics'), then the columns of main_df
# (key: 'tweet'). Both are left joins, so every row of tweets_df is kept and
# unmatched rows get NaN in the joined columns.
# NOTE(review): because of how='left', tweets that were filtered out of
# main_df are still present here -- confirm this is intended.
merged_df = (
    tweets_df
    .merge(tags_df, on='tweet_topics', how='left')
    .merge(main_df, on='tweet', how='left')
)

# Step 2: Prepare the Tags for One-Hot Encoding
def _split_tags(raw_tags):
    """Turn one comma-separated tag string into a list of clean tag names.

    Tags are lower-cased, surrounding whitespace is stripped (so 'a, b' does
    not yield a separate ' b' label) and empty fragments are dropped.
    Non-string values -- e.g. the NaN a left merge leaves for rows without a
    match -- become an empty list ("no tags") instead of raising
    AttributeError on .lower().
    """
    if not isinstance(raw_tags, str):
        return []
    return [tag for tag in (part.strip() for part in raw_tags.lower().split(',')) if tag]


merged_df['tags'] = merged_df['tags'].apply(_split_tags)

# Step 3: One-Hot Encode the Tags
# Each row becomes a 0/1 indicator vector over all tags seen; a row with an
# empty tag list is encoded as all zeros, so the row count is preserved.
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(merged_df['tags'])

# Step 4: Prepare the Features
# Split the raw text BEFORE vectorizing so the TF-IDF vocabulary and IDF
# weights are learned from the training tweets only; fitting on the whole
# corpus would leak test-set statistics into the features and inflate the
# evaluation in Step 6.
# Missing tweet text (e.g. an empty CSV field, which pandas reads as NaN) is
# replaced by '' because TfidfVectorizer rejects NaN documents; filling rather
# than dropping keeps the rows aligned with Y.
tweet_texts = merged_df['tweet'].fillna('')
text_train, text_test, Y_train, Y_test = train_test_split(
    tweet_texts, Y, test_size=0.2, random_state=42
)

# Use TF-IDF Vectorization on 'tweet': fit on train, only transform test
vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust max_features
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Step 5: Train a Multi-Label Classification Model

# Use MultiOutputClassifier with Logistic Regression:
# one independent binary logistic regression is fitted per tag column of Y.
base_clf = LogisticRegression(max_iter=1000)
model = MultiOutputClassifier(base_clf)
model.fit(X_train, Y_train)

# Step 6: Evaluate the Model
# Per-tag precision / recall / F1 on the held-out split
Y_pred = model.predict(X_test)
report = classification_report(Y_test, Y_pred, target_names=mlb.classes_)
print(report)

# Step 7: Predict Tags for New Tweets
new_tweets = [
    'This is a new tweet about topic X',
    'Another tweet about topic Y',
]

# Reuse the fitted vectorizer and model, then map each predicted indicator
# row back to a tuple of tag names.
X_new = vectorizer.transform(new_tweets)
Y_new_pred = model.predict(X_new)
tags_predicted = mlb.inverse_transform(Y_new_pred)

# One summary per tweet, each followed by a blank line
summaries = [
    f"Tweet: {tweet}\nPredicted Tags: {tags}\n"
    for tweet, tags in zip(new_tweets, tags_predicted)
]
print("\n".join(summaries))


KeyboardInterrupt: 

In [16]:
# save the model
import joblib

# The classifier alone cannot score raw text: prediction also needs the
# fitted TF-IDF vectorizer (text -> features) and the label binarizer
# (indicator rows -> tag names). Persist them next to the model so inference
# does not depend on objects that only exist in the training kernel.
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(mlb, 'mlb.pkl')

# Kept as the last expression so the cell still displays ['model.pkl']
joblib.dump(model, 'model.pkl')


['model.pkl']

In [27]:
# use the model to predict if a tweet is about a topic
# NOTE(review): only the classifier is reloaded from disk; `joblib`,
# `vectorizer` and `mlb` must still be alive in the kernel from earlier cells.
model = joblib.load('model.pkl')

new_tweets = [
    'the government is doing a great job',
    'the government is doing a terrible job',
]
X_new = vectorizer.transform(new_tweets)
Y_new_pred = model.predict(X_new)

# One tuple of tag names per input tweet, in input order
tags_predicted = mlb.inverse_transform(Y_new_pred)
for idx, tweet in enumerate(new_tweets):
    tags = tags_predicted[idx]
    print(f"Tweet: {tweet}\nPredicted Tags: {tags}\n")

Tweet: the government is doing a great job
Predicted Tags: ('political_polarisation', 'vulgarity')

Tweet: the government is doing a terrible job
Predicted Tags: ('political_polarisation', 'vulgarity')

