In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import re

In [None]:
# Locations of the labelled training set and the multi-subtask test set.
TRAIN_CSV = '/kaggle/input/train-data/train_data (2).csv'
TEST_CSV = '/kaggle/input/test-data/test_data (3).csv'

# Read both CSV files into DataFrames.
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

# SUBTASK 1 : HUMAN OR AI

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
# Download the NLTK stop-word corpus so that the lookup below can succeed.
nltk.download('stopwords')
# NOTE: this rebinds `stopwords` from the NLTK corpus reader to a plain set of
# English stop words; clean_text() looks the set up under this name.
stopwords = {*stopwords.words('english')}

def clean_text(text, stop_words = None):
    """Normalise a raw document for TF-IDF vectorisation.

    The text is lower-cased, every character that is not a lowercase letter,
    a digit or whitespace is removed, and stop words are dropped.

    Parameters
    ----------
    text : str
        Raw document. Any non-string value (e.g. a NaN from a missing cell)
        is treated as an empty document.
    stop_words : collection of str, optional
        Words to drop. Defaults to the module-level ``stopwords`` set.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    if stop_words is None:
        stop_words = stopwords

    # Missing values would otherwise make re.sub raise a TypeError.
    if not isinstance(text, str):
        return ''

    # Lower-case BEFORE filtering: the character class only whitelists a-z,
    # so filtering first would delete every uppercase letter.
    lowered = text.lower()
    stripped = re.sub(r'[^a-z0-9\s]', '', lowered)

    return ' '.join(word for word in stripped.split() if word not in stop_words)

def preprocess_df(df, text_col = 'text', mode = 'train', vectorizer = None):
    """Clean a text column and turn it into TF-IDF vectors.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the documents. It is not modified.
    text_col : str
        Name of the column that contains the raw text.
    mode : str
        ``'train'`` fits the vectorizer on the cleaned text before
        transforming it; any other value only transforms with the
        vectorizer as it is.
    vectorizer : TfidfVectorizer, optional
        Vectorizer to use. A fresh ``TfidfVectorizer`` is created per call
        when omitted, so separate calls never share fitted state by accident.

    Returns
    -------
    tuple
        ``(vectors, vectorizer)``: the TF-IDF matrix and the vectorizer used.
    """
    # A default instance in the signature would be built once at definition
    # time and silently shared (and re-fitted) across every call.
    if vectorizer is None:
        vectorizer = TfidfVectorizer()

    # Work on a derived Series instead of writing back into `df`; this keeps
    # the caller's data intact and avoids SettingWithCopyWarning on slices.
    cleaned = df[text_col].apply(clean_text)

    if mode == 'train':
        vectors = vectorizer.fit_transform(cleaned)
    else:
        vectors = vectorizer.transform(cleaned)

    return vectors, vectorizer

In [None]:
# Fit the TF-IDF vocabulary on the training text and vectorise it.
x_train, tfidf = preprocess_df(train, text_col = 'text', mode = 'train')

In [None]:
# Target column for subtask 1.
y_train = train.loc[:, 'label']

In [None]:
from sklearn.linear_model import LogisticRegression

# Iteration cap handed to the solver through max_iter.
LOGREG_MAX_ITER = 1000
model = LogisticRegression(max_iter = LOGREG_MAX_ITER)

In [None]:
# Fit the classifier on the TF-IDF training vectors and their labels.
model.fit(x_train, y_train)

In [None]:
# Rows of the test set that belong to subtask 1. Take an explicit copy so the
# helper never operates on a slice of `test`, and keep the frame under its own
# name instead of reusing `x_test` for two different kinds of object.
s1_test_df = test[test['subtaskID'] == 1].copy()

# mode='test' only transforms: the vocabulary fitted on the training text is reused.
x_test, tfidf = preprocess_df(s1_test_df, mode = 'test', vectorizer = tfidf)

In [None]:
# Predict a label for every subtask-1 test document.
y_pred = model.predict(x_test)

In [None]:


# Build the subtask-1 part of the submission.
# The IDs are selected with the same subtaskID == 1 mask that produced the
# features behind y_pred, so predictions and IDs stay aligned even when the
# subtask-1 rows are not the first rows of `test`.
submission_1 = pd.DataFrame({
    'subtaskID' : [1] * len(y_pred),
    'datapointID' : test.loc[test['subtaskID'] == 1, 'ID'],
    'answer' : y_pred
})

# SUBTASK 2: TEXT THEME CLASSIFICATION

In [None]:
# Total number of rows in the test set (all subtasks together).
len(test)

In [None]:
# Rows of the test set that belong to subtask 2. The explicit copy makes
# s2_df an independent frame, so columns can be added to it later without
# pandas warning about assignment on a slice.
s2_df = test[test['subtaskID'] == 2].copy()

# mode='train' RE-FITS the vectorizer on the subtask-2 text, so from here on
# `tfidf` holds the clustering vocabulary and no longer matches the features
# the subtask-1 `model` was trained on.
# A second copy is passed so that s2_df keeps its raw, uncleaned text.
s2_test, tfidf = preprocess_df(s2_df.copy(), mode = 'train', vectorizer = tfidf)

In [None]:
from sklearn.cluster import KMeans
# Cluster the subtask-2 documents into 4 groups; the fixed random_state keeps
# the cluster numbering reproducible between runs.
kmeans = KMeans(n_clusters = 4, random_state = 42)

kmeans.fit(s2_test)

# Cluster index assigned to each document, in the row order of s2_test.
labels = kmeans.labels_

# Rebind s2_df to a new frame rather than assigning a column on what may be a
# slice of `test` (which raises SettingWithCopyWarning).
s2_df = s2_df.assign(label = labels)

In [None]:
def get_keywords(tfidf_vectorizer, kmeans_model, n_terms = 10):
    """Print the highest-weighted vocabulary terms of every cluster centroid.

    Parameters
    ----------
    tfidf_vectorizer
        Fitted vectorizer; ``get_feature_names_out()`` must return the terms
        in the same column order as the centroid coordinates.
    kmeans_model
        Fitted clustering model exposing ``cluster_centers_``.
    n_terms : int
        Number of terms to print per cluster. Zero or a negative value prints
        no terms.

    Returns
    -------
    None
        The result is printed, one header line and one term line per cluster.
    """
    centroids = kmeans_model.cluster_centers_
    terms = tfidf_vectorizer.get_feature_names_out()
    # Clamp so that n_terms == 0 yields nothing; the slice [-0:] would
    # otherwise select every term.
    limit = max(n_terms, 0)

    for i, centroid in enumerate(centroids):
        print(f'Top words of cluster {i}:')
        # argsort is ascending, so reverse it to rank by descending weight.
        top_terms_idx = centroid.argsort()[::-1][:limit]
        top_terms = [terms[ind] for ind in top_terms_idx]
        print(', '.join(top_terms))

In [None]:
# Print the top terms of each cluster so the clusters can be given theme names.
get_keywords(tfidf, kmeans)

In [None]:
# Theme name for each KMeans cluster index.
# NOTE(review): presumably chosen by hand from the get_keywords() output, so
# re-check it whenever the clustering changes. The trailing notes are the
# original author's and disagree with the mapping; confirm which is intended.
cluster_names = {
    0 : 'RELIGION',
    1 : 'SCIENCE',   # author's note: Religion
    2 : 'BUSINESS',  # author's note: crime
    3 : 'CRIME'      # author's note: business
}

# Map from the raw cluster indices in `labels`, not from s2_df['label'] itself.
# Mapping the column onto itself is not idempotent: a second run of this cell
# would look the already-mapped names up in the dict and turn them into NaN.
s2_df = s2_df.assign(
    label = pd.Series(labels, index = s2_df.index).map(cluster_names)
)

In [None]:
# Build the subtask-2 part of the submission: one row per subtask-2 document,
# carrying its ID and the theme name stored in s2_df['label'].
s2_rows = len(s2_df)
submission_2 = pd.DataFrame(
    dict(
        subtaskID = [2] * s2_rows,
        datapointID = s2_df['ID'],
        answer = s2_df['label'],
    )
)

In [None]:
# Stack the per-subtask frames row-wise into the final submission.
submission_parts = [submission_1, submission_2]
submission = pd.concat(submission_parts, axis = 0)

In [None]:
# Display the combined submission frame for a visual check.
submission

In [None]:
# Write the submission to the working directory; index=False leaves the
# DataFrame index out of the file.
submission.to_csv('submission.csv', index = False)