In [7]:
import os
import pandas as pd
from glob import glob
import re
import json

# Constants
PROJECT_PATH = './'
DATA_PATH = os.path.join(PROJECT_PATH, 'output')
SPECIAL_CHARS = {'¨', 'ï', '¸', '?', '$'}
TURKISH_LETTERS = set('ıüğüşçö')

# Utility functions
def starts_with_two_capitals(text):
    return text[:2].isupper()

def count_periods(text):
    return text.count('.')

def contains_turkish_letters(text):
    return any(letter in text for letter in TURKISH_LETTERS)

def contains_any_special_char(text, chars):
    return any(char in text for char in chars)

def starts_with_capital(text):
    return text[0].isupper()

def ends_with_dot(text):
    return text.endswith('.')

def contains_single_letter_word(text):
    return any(len(word) == 1 and word.isalpha() for word in text.split())

def length_difference_within_limit(text1, text2, limit=0.1):
    len1 = len(text1)
    len2 = len(text2)
    return abs(len1 - len2) / max(len1, len2) <= limit

def transform_dashes(text):
    if isinstance(text, str):
        text = re.sub(r'([a-zA-Z])-([a-zA-Z])', r'\1\1', text)
        text = re.sub(r'([a-zA-Z]{2})- ([a-zA-Z]{2})', r'\1\2', text)
    return text

# Merge all CSV files
def merge_csv_files(data_path):
    csv_files = glob(os.path.join(data_path, "*.csv"))
    return pd.concat((pd.read_csv(file) for file in csv_files), ignore_index=True)

merged_df = merge_csv_files(DATA_PATH)
print(merged_df.info())

# Ensure 'tr' and 'en' columns are of string type before applying transformations
merged_df['tr'] = merged_df['tr'].astype(str)
merged_df['en'] = merged_df['en'].astype(str)

# Apply transformation to replace patterns like a-a with aa and aa- aa with aa
merged_df['tr'] = merged_df['tr'].apply(transform_dashes)
merged_df['en'] = merged_df['en'].apply(transform_dashes)

# Data cleaning and filtering
filtered_data_final = (
    merged_df.dropna()
    .drop_duplicates(subset=['tr', 'en'])
    .loc[
        (~merged_df['en'].apply(starts_with_two_capitals)) &
        (~merged_df['tr'].apply(starts_with_two_capitals)) &
        (merged_df['tr'].apply(count_periods) == merged_df['en'].apply(count_periods)) &
        (merged_df['tr'].apply(starts_with_capital)) &
        (merged_df['en'].apply(starts_with_capital)) &
        (merged_df['tr'].apply(contains_turkish_letters)) &
        (~merged_df['tr'].apply(contains_any_special_char, chars=SPECIAL_CHARS)) &
        (~merged_df['en'].apply(contains_any_special_char, chars=SPECIAL_CHARS)) &
        (merged_df['tr'].apply(ends_with_dot)) &
        (merged_df['en'].apply(ends_with_dot)) &
        (~merged_df['tr'].apply(contains_single_letter_word)) &
        (merged_df.apply(lambda row: length_difference_within_limit(row['tr'], row['en']), axis=1))
    ]
)

# add n_characters, n_words, n_sentences columns for en
filtered_data_final['n_characters_en'] = filtered_data_final['en'].apply(len)
filtered_data_final['n_words_en'] = filtered_data_final['en'].apply(lambda x: len(x.split()))
filtered_data_final['n_sentences_en'] = filtered_data_final['en'].apply(lambda x: len(re.split(r'[.!?]', x)))

print(filtered_data_final.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23676 entries, 0 to 23675
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   university  23676 non-null  object
 1   konu        23676 non-null  object
 2   tr          23119 non-null  object
 3   en          23235 non-null  object
dtypes: object(4)
memory usage: 740.0+ KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 3632 entries, 5 to 23669
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   university       3632 non-null   object
 1   konu             3632 non-null   object
 2   tr               3632 non-null   object
 3   en               3632 non-null   object
 4   n_characters_en  3632 non-null   int64 
 5   n_words_en       3632 non-null   int64 
 6   n_sentences_en   3632 non-null   int64 
dtypes: int64(3), object(4)
memory usage: 227.0+ KB
None


In [8]:
# use abbreviations for universities
filtered_data_final['university'] = filtered_data_final['university'].replace(
    {'Orta Doğu Teknik Üniversitesi': 'ODTÜ', 
     'Boğaziçi Üniversitesi': 'BOUN', 
     'İstanbul Teknik Üniversitesi': 'İTÜ', 
     'İhsan Doğramacı Bilkent Üniversitesi': 'BİLKENT', 
     'Koç Üniversitesi': 'KOÇ', 
     'Sabancı Üniversitesi': 'SABANCI'})


# use abbreviations for departments
filtered_data_final['konu'] = filtered_data_final['konu'].replace(
    {'Elektrik ve Elektronik Mühendisliği = Electrical and Electronics Engineering': 'ELEC',
     'Bilgisayar Mühendisliği Bilimleri-Bilgisayar ve Kontrol = Computer Engineering and Computer Science and Control': 'COMP',
     'Makine Mühendisliği = Mechanical Engineering': 'MECH',
     'Endüstri ve Endüstri Mühendisliği = Industrial and Industrial Engineering': 'INDR',
     'Fizik ve Fizik Mühendisliği = Physics and Physics Engineering': 'PHYS',
     'Matematik = Mathematics': 'MATH'})


In [9]:
# create paper_id column as a unique identifier for each row from 1 to n
filtered_data_final['paper_id'] = range(1, len(filtered_data_final) + 1)

# make paper_id the first column
filtered_data_final = filtered_data_final[['paper_id', 'university', 'konu', 'tr', 'en', 'n_characters_en', 'n_words_en', 'n_sentences_en']]




json_data = []
id_counter = 1

for index, row in filtered_data_final.iterrows():
    json_data.append({
        "id": id_counter,
        "paper_id": row["paper_id"],
        "university": row["university"],
        "konu": row["konu"],
        "data": {
            "my_text": f"ENGLISH: {row['en']} \n \n TURKISH: {row['tr']}"
        }
        })
    id_counter += 1

# Save the JSON structure to a file
json_file_path = 'label_studio/yok_all.json'
with open(json_file_path, 'w', encoding='utf-8') as json_file:
    json.dump(json_data, json_file, ensure_ascii=False, indent=4)

In [10]:
# Save the filtered data
filtered_data_final.to_csv(os.path.join(PROJECT_PATH+'/cleaned_output/', 'cleaned_data.csv'), index=False)

# save as excel
filtered_data_final.to_excel(os.path.join(PROJECT_PATH+'/cleaned_output/', 'cleaned_data.xlsx'), index=False, engine='xlsxwriter')

In [11]:
# sample data 43 rows 
sample_data = filtered_data_final.sample(n=43, random_state=1)

# Save the sample data as JSON
sample_json_data = []
id_counter = 1

for index, row in sample_data.iterrows():
    sample_json_data.append({
        "id": id_counter,
        "paper_id": row["paper_id"],
        "university": row["university"],
        "konu": row["konu"],
        "data": {
            "my_text": f"ENGLISH: {row['en']} \n \n TURKISH: {row['tr']}"
        }
        })
    id_counter += 1
    
sample_json_file_path = 'label_studio/yok_sample.json'
with open(sample_json_file_path, 'w', encoding='utf-8') as json_file:
    json.dump(sample_json_data, json_file, ensure_ascii=False, indent=4)

In [12]:
# Save the sample data
sample_data.to_csv(os.path.join(PROJECT_PATH+'/cleaned_output/', 'sample_data.csv'), index=False)

# save as excel
sample_data.to_excel(os.path.join(PROJECT_PATH+'/cleaned_output/', 'sample_data.xlsx'), index=False, engine='xlsxwriter')