In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/test-data/test_data (3).csv
/kaggle/input/train-data/train_data (2).csv
/kaggle/input/sample-output/sample_output(14).csv


In [2]:
import re

In [3]:
train = pd.read_csv('/kaggle/input/train-data/train_data (2).csv')
test = pd.read_csv('/kaggle/input/test-data/test_data (3).csv')

# SUBTASK 1 : HUMAN OR AI

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r'[^a-z0-9\s]', '', text)
    text = text.lower()
    words = text.split()
    words = [word for word in words if not word in stopwords]
    return ' '.join(words)

def preprocess_df(df, text_col = 'text', mode = 'train', vectorizer = TfidfVectorizer()):
    df[text_col] = df[text_col].apply(clean_text)
    if mode == 'train':
        vectors = vectorizer.fit_transform(df[text_col])
    else:
        vectors = vectorizer.transform(df[text_col])


    return vectors, vectorizer

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
x_train, tfidf = preprocess_df(train)

In [6]:
y_train = train['label']

In [7]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter = 1000)

In [8]:
model.fit(x_train, y_train)

In [9]:
x_test = test[test['subtaskID'] == 1]
x_test, tfidf = preprocess_df(x_test, mode = 'test', vectorizer = tfidf)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_col] = df[text_col].apply(clean_text)


In [10]:
y_pred = model.predict(x_test)

In [11]:


submission_1 = pd.DataFrame({
    'subtaskID' : [1] * len(y_pred),
    'datapointID' : test.head(test['subtaskID'].value_counts()[1])['ID'],
    'answer' : y_pred
})

# SUBTASK 2: TEXT THEME CLASSIFICATION

In [12]:
len(test)

5563

In [13]:
s2_test = test[test['subtaskID'] == 2]
s2_df = test[test['subtaskID'] == 2]

s2_test, tfidf = preprocess_df(s2_test, vectorizer = tfidf)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_col] = df[text_col].apply(clean_text)


In [14]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters = 4, random_state = 42)

kmeans.fit(s2_test)

labels = kmeans.labels_
s2_df['label'] = labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  s2_df['label'] = labels


In [15]:
def get_keywords(tfidf_vectorizer, kmeans_model, n_terms = 10):
    centroids = kmeans_model.cluster_centers_
    terms = tfidf_vectorizer.get_feature_names_out()

    for i, centroid in enumerate(centroids):
        print(f'Top words of cluster {i}:')
        top_terms_idx = centroid.argsort()[-n_terms:][::-1]
        top_terms = [terms[ind] for ind in top_terms_idx]
        print(', '.join(top_terms))

In [16]:
get_keywords(tfidf, kmeans)

Top words of cluster 0:
od, religious, church, esus, people, white, hristian, us, faith, said
Top words of cluster 1:
science, scientists, arth, ars, pace, said, climate, space, moon, research
Top words of cluster 2:
company, said, percent, ber, business, women, companies, mazon, employees, workers
Top words of cluster 3:
said, police, shooting, olice, told, school, according, ounty, osby, suspect


In [17]:
cluster_names = { 
    0 : 'RELIGION', 
    1 : 'SCIENCE', #Religion
    2 : 'BUSINESS', #crime
    3 : 'CRIME' #business
}

s2_df['label'] = s2_df['label'].map(cluster_names)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  s2_df['label'] = s2_df['label'].map(cluster_names)


In [18]:
submission_2 = pd.DataFrame({
    'subtaskID' : [2] * len(s2_df),
    'datapointID' : s2_df['ID'],
    'answer' : s2_df['label']
})

In [19]:
submission = pd.concat([submission_1, submission_2], axis = 0)

In [20]:
submission

Unnamed: 0,subtaskID,datapointID,answer
0,1,10001,1.0
1,1,10002,0.0
2,1,10003,1.0
3,1,10004,0.0
4,1,10005,1.0
...,...,...,...
5558,2,15796,CRIME
5559,2,15797,RELIGION
5560,2,15798,SCIENCE
5561,2,15799,BUSINESS


In [21]:
submission.to_csv('submission.csv', index = False)