In [30]:
import pandas as pd

dataframe = pd.read_json('training-dataset.jsonl', lines=True)


In [31]:
df = pd.json_normalize(dataframe['profile']) #creates dataframe for the profile
df_posts_ = pd.json_normalize(dataframe['posts']) #creates dataframe for the posts

In [32]:
#unpacks the objects in df_posts_
def unpack_nested_column(df, column_name):
    unpacked_df = pd.json_normalize(df[column_name])
    unpacked_df.columns = [f"{column_name}_{subcol}" for subcol in unpacked_df.columns]
    return unpacked_df

# Iterate over all columns, unpack nested ones, and store in a list
unpacked_dfs = []
for col in df_posts_.columns:
    if isinstance(df_posts_[col].iloc[0], dict):
        unpacked_df = unpack_nested_column(df_posts_, col)
        df_posts_ = df_posts_.drop(columns=[col]).join(unpacked_df)
    else:
        unpacked_dfs.append(df_posts_[[col]])

# Concatenate the remaining DataFrames (non-nested columns)
if unpacked_dfs:
    non_nested_df = pd.concat(unpacked_dfs, axis=1)
    result_df = pd.concat([non_nested_df, df_posts_], axis=1)
else:
    result_df = df_posts_

df_posts_ = result_df

In [33]:
df_posts_ = df_posts_.drop(columns=[col for col in df_posts_.columns if 'media_url' in col])
df_posts_ = df_posts_.drop(columns=[col for col in df_posts_.columns if 'id' in col])
df_posts_ = df_posts_.drop(columns=[col for col in df_posts_.columns if 'timestamp' in col])
df_posts_ = df_posts_.drop(columns=[col for col in df_posts_.columns if 'comments_count' in col])
df_posts_ = df_posts_.drop(columns=[col for col in df_posts_.columns if 'like_count' in col])
df_posts_ = df_posts_.drop(columns=[col for col in df_posts_.columns if 'media_type' in col])

# Combine all caption columns into one
df_posts_['combined'] = df_posts_.filter(like='caption').apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)

# Drop the original caption columns
df_posts_ = df_posts_.drop(columns=[col for col in df_posts_.columns if '_caption' in col])

In [34]:
import numpy as np

df = df.drop('is_private', axis=1)
df = df.drop('profile_pic_url', axis=1)
df = df.drop('profile_picture_base64', axis=1)
df = df.drop('business_phone_number', axis=1)
df = df.drop('eimu_id', axis=1)
df = df.drop('fbid', axis=1)
df = df.drop('fb_profile_biolink', axis=1)
df = df.drop('id', axis=1)
df = df.drop('is_professional_account', axis=1)
df = df.drop('ai_agent_type', axis=1)
df = df.drop('restricted_by_viewer', axis=1)
df = df.drop('business_email', axis=1)
df = df.drop('is_regulated_c18', axis=1)
df = df.drop('entities', axis=1)
df = df.drop('overall_category_name', axis=1)
df = df.drop('post_count', axis=1)
df = df.drop('bio_links', axis=1)
df = df.drop('business_address_json', axis=1)
df = df.drop('category_enum', axis=1)
df.drop(['should_show_category', 'should_show_public_contacts' ] , axis=1, inplace=True)
df.drop(['hide_like_and_view_counts','highlight_reel_count', 'business_contact_method'], axis=1, inplace=True)

uniform_columns = []

for column in df.columns:
    if df[column].nunique() == 1:
        uniform_columns.append(column)
for column in uniform_columns:
    df = df.drop(column, axis=1)


In [35]:
# Create a new boolean column based on 'external_url'
df['external_url_bool'] = df['external_url'].apply(lambda x: x is not None)

# Drop the original 'external_url' column
df = df.drop('external_url', axis=1)


In [36]:
# List of column names to concatenate
columns_to_combine = ['full_name','biography', 'category_name', 'business_category_name']  # Replace with your actual column names

# Concatenate the string columns into a new column 'combined_text'
df['combined_text'] = df[columns_to_combine].fillna('').agg(' '.join, axis=1)
df = df.drop('full_name', axis=1)
df = df.drop('biography', axis=1)
df = df.drop('category_name', axis=1)
df = df.drop('business_category_name', axis=1)

In [37]:
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
# Download Turkish stopwords
nltk.download('stopwords')
turkish_stopwords = stopwords.words('turkish')
english_stopwords = stopwords.words('english')
for i in range(len(english_stopwords)):
    english_stopwords[i] = english_stopwords[i].lower()
for i in range(len(turkish_stopwords)):
    turkish_stopwords[i] = turkish_stopwords[i].casefold()

stopwords = turkish_stopwords + english_stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [38]:
#Creating columns using TF-IDF for most used 8500 words in combined text in profile dataframe


# Define your custom vocabulary
custom_vocab = ['Politician', 'Public & Government Service', 'Şehir', 'Government organization' , 'Belediye', 'Başkanlığı', 'tr', 'Government organization']  # Replace with your custom words

# Define the preprocessing function
def preprocess_text(text: str):
    # Lowercase Turkish text using casefold
    text = text.casefold()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text
# Apply preprocessing to the 'combined_text' column
df['combined_text'] = df['combined_text'].apply(preprocess_text)

# Initialize TfidfVectorizer with Turkish stopwords and max features
vectorizer = TfidfVectorizer(stop_words=stopwords, max_features=8500)

# Fit and transform the 'combined_text' column
tfidf_matrix = vectorizer.fit_transform(df['combined_text'])

# Get feature names (the dynamically selected top words)
dynamic_features = vectorizer.get_feature_names_out()

# Combine dynamic features with custom vocabulary, ensuring no duplicates
final_vocab = list(set(dynamic_features).union(custom_vocab))

# Reinitialize TfidfVectorizer with the combined vocabulary
final_vectorizer = TfidfVectorizer(vocabulary=final_vocab)

# Transform the text again with the combined vocabulary
final_tfidf_matrix = final_vectorizer.fit_transform(df['combined_text'])

# Convert to DataFrame for readability
tfidf_df = pd.DataFrame(final_tfidf_matrix.toarray(), columns=final_vectorizer.get_feature_names_out())

# Concatenate the original DataFrame with the TF-IDF DataFrame
df = pd.concat([df, tfidf_df], axis=1)




In [39]:
#Creating columns using TF-IDF for most used 8500 words in combined text in posts dataframe


# Define the preprocessing function
def preprocess_text(text: str):
    # Lowercase Turkish text using casefold
    text = text.casefold()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text
# Apply preprocessing to the 'combined_text' column
df_posts_['combined'] = df_posts_['combined'].apply(preprocess_text)

# Initialize TfidfVectorizer with Turkish stopwords and max features
vectorizer_posts = TfidfVectorizer(stop_words=stopwords, max_features=8500)

# Fit and transform the 'combined_text' column
tfidf_matrix_posts = vectorizer_posts.fit_transform(df_posts_['combined'])

# Get feature names (the dynamically selected top words)
dynamic_features_posts = vectorizer_posts.get_feature_names_out()

# Combine dynamic features with custom vocabulary, ensuring no duplicates
final_vocab_posts = list(set(dynamic_features_posts))

# Reinitialize TfidfVectorizer with the combined vocabulary
final_vectorizer_posts = TfidfVectorizer(vocabulary=final_vocab_posts)

# Transform the text again with the combined vocabulary
final_tfidf_matrix_posts = final_vectorizer_posts.fit_transform(df_posts_['combined'])

# Convert to DataFrame for readability
tfidf_df_posts = pd.DataFrame(final_tfidf_matrix_posts.toarray(), columns=final_vectorizer_posts.get_feature_names_out())

# Concatenate the original DataFrame with the TF-IDF DataFrame
df = pd.concat([df, tfidf_df_posts], axis=1)


In [40]:
#Adding training data

train_classification_df = pd.read_csv("train-classification.csv",)
train_classification_df = train_classification_df.rename(columns={'Unnamed: 0': 'user_id', 'label': 'category'})

# Unifying labels
train_classification_df["category"] = train_classification_df["category"].apply(str.lower)
username2_category = train_classification_df.set_index("user_id").to_dict()["category"]

In [41]:
# Adding training data
# Add a row at the beginning of the dataframe to avoid detecting a feature as a column name
test_classification_df = pd.read_csv("test-classification-round3.dat",header=None)
test_classification_df.columns = ['user_id']
# Unifying labels
username2_category_test = test_classification_df['user_id'].apply(str.lower).str.strip().tolist()
username2_category_test = [uname.lower().strip() for uname in username2_category_test]

In [42]:
# Create a boolean mask for train and test sets
is_train = df['username'].isin(username2_category.keys())
is_test = df['username'].apply(str.lower).str.strip().isin(username2_category_test)

In [43]:
# Split the dataframe into train and test sets
train_df = df[is_train].reset_index(drop=True)
test_df = df[is_test].reset_index(drop=True)

# Check if all usernames in username2_category_test are in test_df
for i in username2_category_test:
    if i not in test_df['username'].values:
        print(i)
        username2_category_test.remove(i)

# Display the shapes of the resulting dataframes
print("Train set shape:", train_df.shape)
print("Test set shape:", test_df.shape)

Train set shape: (2741, 17016)
Test set shape: (1000, 17016)


In [44]:
train_usernames = train_df['username']

y = [username2_category.get(uname, "NA") for uname in train_usernames]

X = train_df.drop(['username','combined_text'], axis=1)

X_Test = test_df.drop(['username','combined_text'], axis=1)

X.columns = X.columns.astype(str)
X_Test.columns = X_Test.columns.astype(str)

In [45]:
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE                                            
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Convert non-numeric features to numeric
X = pd.get_dummies(X, drop_first=True)
X = X.fillna(0)

X_Test = pd.get_dummies(X_Test, drop_first=True)
X_Test = X_Test.fillna(0)


# Encode target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Balance classes using SMOTE
smote = BorderlineSMOTE(random_state=42, k_neighbors=1, sampling_strategy='not minority', m_neighbors=11, kind='borderline-1')
X_resampled, y_resampled= smote.fit_resample(X, y_encoded)

# Scale features for better optimization
scaler = StandardScaler()
X_resampled = scaler.fit_transform(X_resampled)
X_Test = scaler.fit_transform(X_Test)

# Train SGDClassifier
clf = SGDClassifier(random_state=42, max_iter=3000, tol=1e-3, class_weight='balanced', loss='log_loss', n_jobs=-1)
clf.fit(X_resampled, y_resampled) #used full training data

# Evaluate the model
y_pred_deniz_test = clf.predict(X_Test)
y_pred_decoded_test = label_encoder.inverse_transform(y_pred_deniz_test)






In [46]:
import pandas as pd
import json


# Create a dictionary with usernames as keys and predicted labels as values
username_label_dict = dict(zip(test_df['username'], y_pred_decoded_test))

# Convert the dictionary to a JSON object
json_object = json.dumps(username_label_dict, indent=4)

predictions_dict = username_label_dict

# Convert the dictionary into a DataFrame for easier processing
predictions = pd.DataFrame(list(predictions_dict.items()), columns=["username", "category"])

# Count initial values for potential changes(belediye)
initial_count = predictions.loc[
    predictions['username'].str.contains('belediye', case=False, na=False) &
    (predictions['category'] != 'Health and Lifestyle')
].shape[0]

# Update the category for usernames containing 'belediye'
predictions.loc[predictions['username'].str.contains('belediye', case=False, na=False), 'category'] = 'Health and Lifestyle'

# Count initial values for potential changes (bld)
initial_count_1 = predictions.loc[
    predictions['username'].str.contains('bld', case=False, na=False) &
    (predictions['category'] != 'Health and Lifestyle')
].shape[0]

# Update the category for usernames containing 'bld'
predictions.loc[predictions['username'].str.contains('bld', case=False, na=False), 'category'] = 'Health and Lifestyle'

# Count initial values for potential changes(bel.tr)
initial_count_2 = predictions.loc[
    predictions['username'].str.contains('bel.tr', case=False, na=False) &
    (predictions['category'] != 'Health and Lifestyle')
].shape[0]

# Update the category for usernames containing 'bel.tr'
predictions.loc[predictions['username'].str.contains('bel.tr', case=False, na=False), 'category'] = 'Health and Lifestyle'

# Update the category for usernames containing 'bel'
predictions.loc[predictions['username'].str.endswith('bel'), 'category'] = 'Health and Lifestyle'

predictions['category'] = predictions['category'].str.title()

# Replace 'And' with 'and' in the 'category' column
predictions['category'] = predictions['category'].str.replace('And', 'and')

# Convert the DataFrame back into a dictionary
updated_predictions_dict = dict(zip(predictions['username'], predictions['category']))

# Save the updated dictionary to a JSON file
with open("predictionround3.json", "w") as f:
    json.dump(updated_predictions_dict, f, indent=4)

# Print the count of changes
print(f"Number of categories changed to 'Health and Lifestyle': {initial_count+initial_count_1+initial_count_2}")

Number of categories changed to 'Health and Lifestyle': 41


In [None]:
# Final reordering, I believe this may be unnecessary. It's here just in case.
import json

# Read the .dat file to get the list of usernames
with open('test-classification-round3.dat', 'r') as file:
    usernames = [line.strip() for line in file]

# Read the JSON file to get the current mapping of usernames to predictions
with open('predictionround3.json', 'r') as file:
    predictions = json.load(file)

# Create a new ordered dictionary
ordered_predictions = {username: predictions[username] for username in usernames if username in predictions}

# Check for any missing usernames or predictions
missing_from_dat = set(predictions.keys()) - set(usernames)
missing_from_json = set(usernames) - set(predictions.keys())

if missing_from_dat:
    print("Warning: The following usernames are in the JSON file but not in the .dat file:")
    print(missing_from_dat)

if missing_from_json:
    print("Warning: The following usernames are in the .dat file but not in the JSON file:")
    print(missing_from_json)

# Write the new ordered JSON object back to a file
with open('prediction-classification-round3.json', 'w') as file:
    json.dump(ordered_predictions, file, indent=4)

print("Reordering complete. Please check the warnings above, if any.")
