In [1]:
# Importing Libraries
import pandas as pd
import re
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Locations of the competition's labelled training data and the test data.
TRAIN_CSV = '/kaggle/input/jigsaw-agile-community-rules/train.csv'
TEST_CSV = '/kaggle/input/jigsaw-agile-community-rules/test.csv'

# Read each CSV into its own DataFrame.
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

In [3]:
# Cleaning the text and removing special characters
def clean_text(text):
    """Lower-case ``text`` and drop everything except a-z, 0-9 and whitespace.

    Args:
        text: Raw text to normalise. Anything that is not a ``str`` (for
            example ``None`` or the float ``NaN`` pandas uses for an empty
            cell) is treated as missing text.

    Returns:
        str: The lower-cased text with all other characters removed, or
        ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # Missing cells would otherwise fail on ``.lower()``.
        return ''
    # Lower-casing happens first, so only a-z needs to be whitelisted.
    return re.sub(r'[^a-z0-9\s]', '', text.lower())

In [4]:
# Replace the 'body' column of both frames with its cleaned version
for frame in (train_df, test_df):
    frame['body'] = frame['body'].apply(clean_text)

In [5]:
# Create a URL feature: 1 when the row's own 'body' contains 'http', else 0.
# The flag is built inside a loop so that every frame is derived from its own
# text; computing the test flag from train_df['body'] would assign test rows
# the flags of unrelated training rows (pandas aligns the assignment by index).
for frame in (train_df, test_df):
    frame['has_url'] = frame['body'].str.contains('http').astype(int)

In [6]:
# Creating features and target values (i.e X and y)
feature_cols = ['body', 'rule', 'subreddit', 'has_url']
X = train_df[feature_cols]
y = train_df['rule_violation']
# The test features are the same columns selected from test_df (a bare list
# of column names is not a feature table).
X_test = test_df[feature_cols]

In [7]:
# Splitting the training data: 20% of the rows are held out for validation
# (test_df is not involved here); the seed is fixed so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Encoding
preprocessor = ColumnTransformer(
    transformers = [
        ('text', TfidfVectorizer(),'body'),
        ('cat', OneHotEncoder(handle_unknown='ignore'),['subreddit', 'rule']),
        ('num', 'passthrough',['has_url'])
    ])

In [9]:
# Model pipeline: column preprocessing followed by a logistic regression
# with default settings.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
]
model = Pipeline(steps=pipeline_steps)

# Fit on the training split only; the validation rows stay unseen.
model.fit(X_train, y_train)

In [10]:
# Predict class labels for the validation split with the pipeline fitted
# above; these labels feed the F1 score in the evaluation cell.
y_pred = model.predict(X_val)

In [11]:
# Evaluating the model on the validation split.
# F1 is computed from the predicted labels.
val_f1 = f1_score(y_val, y_pred)
print(f'F1 Score: {val_f1*100:.3f}%')

# ROC-AUC is computed from column 1 of predict_proba rather than from labels.
val_scores = model.predict_proba(X_val)[:, 1]
val_auc = roc_auc_score(y_val, val_scores)
print(f'ROC-AUC: {val_auc*100:.3f}%')

F1 Score: 74.260%
ROC-AUC: 81.017%


In [12]:
# Cross Validation: 5-fold F1 of the whole pipeline over the full training
# features and target; the line printed is the mean followed by the standard
# deviation of the fold scores.
cv_scores = cross_val_score(model, X, y, cv=5, scoring='f1')
cv_mean, cv_std = cv_scores.mean(), cv_scores.std()
print(f'Cross-Validation F1 Scores: {cv_mean:.3f}, {cv_std:.3f}')

Cross-Validation F1 Scores: 0.721, 0.016


In [13]:
# Predicting on test data and saving to csv
# Same feature columns, in the same order, as the training feature table.
X_test = test_df[['body', 'rule', 'subreddit', 'has_url']]

# Submit a score per row instead of a hard 0/1 label: a ranking metric such
# as the ROC-AUC reported earlier in this notebook needs scores, and labels
# discard the ranking information.
# NOTE(review): confirm the competition is scored on AUC / expects
# probabilities. Column 1 is used exactly as in the validation ROC-AUC cell
# (assumes rule_violation is 0/1, so column 1 is the violation class).
test_predictions = model.predict_proba(X_test)[:, 1]

submission = pd.DataFrame({
    'row_id': test_df['row_id'],
    'rule_violation': test_predictions,
})
submission.to_csv('submission.csv', index=False)