In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
import nltk
import spacy

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from scipy.sparse import hstack
from collections import Counter
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from wordcloud import WordCloud
from logging import getLogger

pd.options.mode.chained_assignment = None

In [5]:
train = pd.read_csv("/kaggle/input/gods-4-0-dataset/train (8).csv")
test = pd.read_csv("/kaggle/input/gods-4-0-dataset/test (6).csv")
ss = pd.read_csv("/kaggle/input/gods-4-0-dataset/SampleSubmission (13).csv")

In [6]:
train.dropna(inplace=True)
test.dropna(inplace=True)

In [20]:
train["input"] = train["title"] + " "  + train["content"]
test["input"] = test["title"] + " " + test["content"]

In [13]:
################################ TO REMOVE PUNCTUATION

PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

from nltk.corpus import stopwords
", ".join(stopwords.words('english'))


################################ TO REMOVE STOPWORDS

STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])


################################ TO REMOVE MOST FREQUENT WORDS

def remove_freqwords(text):
    """custom function to remove the frequent words"""
    return " ".join([word for word in str(text).split() if word not in FREQWORDS])


################################ APPLYING STEMMING


stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])
        

In [14]:
train['input']=train['input'].apply(lambda x: remove_punctuation(x))
train['input']=train['input'].apply(lambda x: remove_stopwords(x))
train['input']=train['input'].apply(lambda x: stem_words(x))

test['input']=test['input'].apply(lambda x: remove_punctuation(x))
test['input']=test['input'].apply(lambda x: remove_stopwords(x))
test['input']=test['input'].apply(lambda x: stem_words(x))


In [27]:
train_text = train['input']
test_text = test['input']
all_text = pd.concat([train_text, test_text])

word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 2),
    max_features=10000)
word_vectorizer.fit(all_text)

In [32]:

char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(2, 6),
    max_features=50000)
char_vectorizer.fit(all_text)




In [42]:
try:
    train = train.reset_index(drop=True)
except:
    pass

In [43]:
def make_folds(train_df, n_splits):

    train_df["fold"] = -1
    X = train_df["content"]
    y = train_df["target"]
    skf = StratifiedKFold(n_splits=5)

    for i, (train_index, val_index) in enumerate(skf.split(X, y)):
        train_df.loc[val_index, "fold"] = i
    return train_df

In [46]:
train = make_folds(train, 5) #Stratification based on the target values to make sure all classes are present

# Training Phase

In [48]:
import joblib
all_preds = []
all_gt = []
for fold in range(5):

    print(f'--------------------------------Training Fold {fold+1}/5---------------------------------')

    train_ = train[train.fold != fold]
    valid_ = train[train.fold == fold]

    print(f'train shape : {len(train_)}')
    print(f'valid shape : {len(valid_)}')

    train_word_features = word_vectorizer.transform(train_["content"])
    test_word_features = word_vectorizer.transform(valid_["content"])

    train_char_features = char_vectorizer.transform(train_["content"])
    test_char_features = char_vectorizer.transform(valid_["content"])

    train_features = hstack([train_char_features, train_word_features])
    test_features = hstack([test_char_features, test_word_features])

    classifier = LogisticRegression(C=2, solver='sag')

    classifier.fit(train_features, train_["target"])

    preds = classifier.predict(test_features)
    print(accuracy_score(valid_['target'].to_list(), preds))

    all_gt.append(valid_['target'].to_list())
    all_preds.append(preds)

    joblib.dump(classifier, f'logistic_regression_fold_{fold}.pkl')


--------------------------------Training Fold 1/5---------------------------------
train shape : 17527
valid shape : 4382
0.7507987220447284
--------------------------------Training Fold 2/5---------------------------------
train shape : 17527
valid shape : 4382
0.7562756732085806
--------------------------------Training Fold 3/5---------------------------------
train shape : 17527
valid shape : 4382
0.7453217708808764
--------------------------------Training Fold 4/5---------------------------------
train shape : 17527
valid shape : 4382
0.7590141487905067
--------------------------------Training Fold 5/5---------------------------------
train shape : 17528
valid shape : 4381
0.7404702122803013


# Inference Phase

In [23]:
test = pd.read_csv("/kaggle/input/gods-4-0-dataset/test (6).csv")
test["content"] = test["content"].fillna(" ")

In [24]:
import joblib
from scipy.sparse import hstack
from sklearn.metrics import accuracy_score

final_test_preds = []
for fold in range(5):

    print(f'------------------------------ Validating Fold {fold+1}/5 ------------------------------')


    
    test_word_features = word_vectorizer.transform(test["content"])
    test_char_features = char_vectorizer.transform(test["content"])
    test_features = hstack([test_char_features, test_word_features])

    model_path = f'/kaggle/input/old-school-nlp/logistic_regression_fold_{fold}.pkl'
    classifier = joblib.load(model_path)

    preds = classifier.predict(test_features)
    preds_proba = classifier.predict_proba(test_features)

    final_test_preds.append(preds_proba)


------------------------------ Validating Fold 1/5 ------------------------------
------------------------------ Validating Fold 2/5 ------------------------------
------------------------------ Validating Fold 3/5 ------------------------------
------------------------------ Validating Fold 4/5 ------------------------------
------------------------------ Validating Fold 5/5 ------------------------------


In [25]:
final_preds = np.mean(final_test_preds, 0) #average of 5 folds

In [26]:
for i in range(5):
    test[f'deb_base_preds_class_{i}'] = final_preds[:, i]
test = test[[col for col in test.columns if col not in ["title", "content"]]]

In [28]:
test.to_csv("os_test.csv", index=False) 