1. Importing Libraries

In [19]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import json
from cities import Cities, replace_turkish_letters
import re
from thefuzz import process
# Load the raw inputs: the high-school reference table used for the entry-score
# lookup, and the training table that the rest of the notebook transforms.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk.
lise_frame = pd.read_csv('lise_en.csv', low_memory=False)
df = pd.read_csv('train.csv', low_memory=False)


In [20]:
# Print the frame summary (column names, non-null counts, dtypes) for the raw data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65125 entries, 0 to 65124
Data columns (total 44 columns):
 #   Column                                                     Non-Null Count  Dtype  
---  ------                                                     --------------  -----  
 0   Basvuru Yili                                               65125 non-null  int64  
 1   Degerlendirme Puani                                        65124 non-null  float64
 2   Cinsiyet                                                   64956 non-null  object 
 3   Dogum Tarihi                                               64948 non-null  object 
 4   Dogum Yeri                                                 64334 non-null  object 
 5   Ikametgah Sehri                                            63088 non-null  object 
 6   Universite Adi                                             64993 non-null  object 
 7   Universite Turu                                            64870 non-null  object 
 8   Burslu

2. Data Manipulation

In [21]:
# Pass every object-dtype column through replace_turkish_letters so that later
# string comparisons work on one normalised spelling.
# NOTE(review): the helper lives in cities.py; judging by later outputs such as
# 'hazirligim' it presumably lowercases and ASCII-folds Turkish letters - confirm.
text_columns = [name for name in df.columns if df[name].dtype == 'object']
for name in text_columns:
    df[name] = df[name].apply(replace_turkish_letters)

2.1. Doğum Tarihi

In [22]:
# Turkish month names mapped to the English names pd.to_datetime understands.
# Each month that contains a Turkish-specific letter is listed twice: once in
# its native spelling and once ASCII-folded, because the text columns have
# already been run through replace_turkish_letters by the time dates are parsed
# (so 'şubat' may arrive as 'subat', 'ağustos' as 'agustos', and so on).
turkish_to_english_months = {
    'ocak': 'january',
    'şubat': 'february',
    'subat': 'february',
    'mart': 'march',
    'nisan': 'april',
    'mayıs': 'may',
    'mayis': 'may',
    'haziran': 'june',
    'temmuz': 'july',
    'ağustos': 'august',
    'agustos': 'august',
    'eylül': 'september',
    'eylul': 'september',
    'ekim': 'october',
    'kasım': 'november',
    'kasim': 'november',
    'aralık': 'december',
    'aralik': 'december',
}


def replace_turkish_months(date_str):
    """Replace Turkish month names in a date string with English ones.

    Matching is a plain, case-sensitive substring replacement against the
    lowercase keys of ``turkish_to_english_months``.

    Parameters
    ----------
    date_str : str
        Date text that may contain a Turkish month name.

    Returns
    -------
    str
        ``date_str`` with every known month name replaced. A non-string
        argument is returned unchanged instead of raising.
    """
    if not isinstance(date_str, str):
        return date_str
    for turkish, english in turkish_to_english_months.items():
        date_str = date_str.replace(turkish, english)
    return date_str

# Replace each 'Dogum Tarihi' cell with a parsed pandas Timestamp, or NaN when
# the text cannot be read as a date. Rows are handled one at a time.
for row_label, record in df.iterrows():
    # str() turns a missing cell into the text 'nan', which simply fails to parse.
    birth_text = replace_turkish_months(str(record['Dogum Tarihi']))

    # First attempt: day-first interpretation; second attempt: month-first.
    birth_date = pd.to_datetime(birth_text, errors='coerce', dayfirst=True)
    if pd.isna(birth_date):
        birth_date = pd.to_datetime(birth_text, errors='coerce', dayfirst=False)

    # Store NaN (not NaT) when neither attempt produced a date.
    df.at[row_label, 'Dogum Tarihi'] = np.nan if pd.isna(birth_date) else birth_date



  parsed_date = pd.to_datetime(dogum_tarihi, errors='coerce', dayfirst=True)
  parsed_date = pd.to_datetime(dogum_tarihi, errors='coerce', dayfirst=True)
  parsed_date = pd.to_datetime(dogum_tarihi, errors='coerce', dayfirst=True)


2.2. Yas Column

In [23]:
# Derive the birth year and the age at application time.
birth_dates = pd.to_datetime(df['Dogum Tarihi'], errors='coerce')
df['Dogum Tarihi'] = birth_dates
df['Dogum Yili'] = birth_dates.dt.year
df['Yas'] = df['Basvuru Yili'] - df['Dogum Yili']

# Ages of 0 or below, or of 100 and above, are blanked out as NaN.
implausible_age = (df['Yas'] <= 0) | (df['Yas'] >= 100)
df.loc[implausible_age, 'Yas'] = np.nan


2.3. Dogum Yeri

In [24]:
# City lookup helper from the local cities module.
cities = Cities()

# Replace every 'Dogum Yeri' cell with the city found in its text: a direct
# city-name search first, then the county search as a second attempt.
# Cells where neither search returns anything become NaN.
for row_label, record in df.iterrows():
    birthplace_text = str(record['Dogum Yeri'])
    resolved_city = cities.search_city_name_in_text(birthplace_text)
    if resolved_city is None:
        resolved_city = cities.search_counties_in_text(birthplace_text)
    df.at[row_label, 'Dogum Yeri'] = np.nan if resolved_city is None else resolved_city


2.4. Ikametgah Sehri

In [25]:
# City lookup helper from the local cities module.
cities = Cities()

# Replace every 'Ikametgah Sehri' cell with the city found in its text: a direct
# city-name search first, then the county search as a second attempt.
# Cells where neither search returns anything become NaN.
for row_label, record in df.iterrows():
    residence_text = str(record['Ikametgah Sehri'])
    resolved_city = cities.search_city_name_in_text(residence_text)
    if resolved_city is None:
        resolved_city = cities.search_counties_in_text(residence_text)
    df.at[row_label, 'Ikametgah Sehri'] = np.nan if resolved_city is None else resolved_city

2.5. Universite Kacinci Sinif

In [26]:
def convert_to_numeric_from_sinif(value):
    """Map a 'Universite Kacinci Sinif' entry to an integer class level.

    Named levels are mapped as: 'hazirlik' -> 0, 'mezun' -> 5,
    'yuksek lisans' / 'yüksek lisans' -> 6, 'tez' -> 7.
    Numeric text such as '3' or '3.0' and numeric values are converted to
    their integer value (previously every numeric string fell through to 0).

    Parameters
    ----------
    value : object
        Raw cell value: text, an integer, a float or a missing value.

    Returns
    -------
    int
        The class level; 0 for anything missing or unrecognised, which keeps
        the original fallback.
    """
    named_levels = {
        'hazirlik': 0,
        'mezun': 5,
        # Both spellings: the text columns are ASCII-folded earlier in the
        # notebook, so the native spelling alone would never match.
        'yüksek lisans': 6,
        'yuksek lisans': 6,
        'tez': 7,
    }

    if isinstance(value, str):
        text = value.strip()
        if text in named_levels:
            return named_levels[text]
        try:
            # float() first so that '3.0' is accepted as well as '3'.
            return int(float(text))
        except (ValueError, OverflowError):
            # Not a number ('nan', 'inf', free text): keep the default.
            return 0

    if isinstance(value, int):
        return value
    if isinstance(value, np.integer):
        return int(value)
    if isinstance(value, float):
        try:
            return int(value)
        except (ValueError, OverflowError):
            # NaN or infinity.
            return 0
    return 0

# Overwrite each 'Universite Kacinci Sinif' cell with its converted class level.
for row_label, record in df.iterrows():
    df.at[row_label, 'Universite Kacinci Sinif'] = convert_to_numeric_from_sinif(
        record['Universite Kacinci Sinif']
    )

2.6. Universite Not Ortalamasi

In [27]:
# Show the distinct 'Universite Not Ortalamasi' values before they are converted.
df['Universite Not Ortalamasi'].unique()

array(['3.50-3', '3.00-2.50', nan, '2.50 ve alti', '3.00 - 4.00',
       '3.50 - 4.00', '3.00 - 3.50', '2.50 -3.00', 'ortalama bulunmuyor',
       '4-3.5', '2.50 - 3.00', '2.00 - 2.50', '1.00 - 2.50',
       'not ortalamasi yok', '4.0-3.5', '3.00 - 3.49', '2.50 - 2.99',
       '1.80 - 2.49', 'hazirligim', '0 - 1.79'], dtype=object)

In [29]:
import re
import numpy as np

def convert_to_numeric(value):
    """Turn a grade-average entry into a single number.

    For text, the numbers it contains decide the result: two numbers give
    their mean (e.g. '3.50-3' -> 3.25), one number is returned as a float.
    Text with any other count of numbers yields 2.5 when it contains "altı"
    and NaN otherwise. Non-text ints and floats are returned unchanged;
    everything else becomes NaN.
    """
    if not isinstance(value, str):
        # Numbers pass straight through (NaN included); other types count as missing.
        return value if isinstance(value, (int, float)) else np.nan

    numbers = re.findall(r"\d+\.\d+|\d+", value)
    if len(numbers) == 2:
        first, second = numbers
        return (float(first) + float(second)) / 2
    if len(numbers) == 1:
        return float(numbers[0])

    # Only text without exactly one or two numbers gets this far.
    if "altı" in value:
        return float(2.50)
    no_average_markers = ("ortalama bulunmuyor", "not ortalamasi yok", 'hazirligim')
    if any(marker in value for marker in no_average_markers):
        return np.nan
    return np.nan

# Overwrite each 'Universite Not Ortalamasi' cell with its numeric equivalent.
for row_label, record in df.iterrows():
    numeric_average = convert_to_numeric(record['Universite Not Ortalamasi'])
    df.at[row_label, 'Universite Not Ortalamasi'] = numeric_average

In [30]:
# Show the distinct 'Universite Not Ortalamasi' values after the numeric conversion.
df['Universite Not Ortalamasi'].unique()

array([3.25, 2.75, nan, 2.5, 3.5, 3.75, 2.25, 1.75, 3.245, 2.745, 2.145,
       0.895], dtype=object)

2.7. Lise Adi and Lise Adi Diger

In [31]:
# Wherever 'Lise Adi' does not hold text (e.g. it is NaN), copy in the value
# from 'Lise Adi Diger' for the same row.
lacks_school_name = df['Lise Adi'].map(lambda cell: type(cell) != str)
df.loc[lacks_school_name, 'Lise Adi'] = df.loc[lacks_school_name, 'Lise Adi Diger']

2.8. Lise Sehir

In [32]:
# City lookup helper from the local cities module.
cities = Cities()

# Replace every 'Lise Sehir' cell with the city found in its text: a direct
# city-name search first, then the county search as a second attempt.
# Cells where neither search returns anything become NaN.
for row_label, record in df.iterrows():
    school_city_text = str(record['Lise Sehir'])
    resolved_city = cities.search_city_name_in_text(school_city_text)
    if resolved_city is None:
        resolved_city = cities.search_counties_in_text(school_city_text)
    df.at[row_label, 'Lise Sehir'] = np.nan if resolved_city is None else resolved_city

2.9. Lise Bolum and Lise Bolum Diger

In [33]:
# Wherever 'Lise Bolumu' does not hold text (e.g. it is NaN), copy in the value
# from 'Lise Bolum Diger' for the same row.
lacks_department = df['Lise Bolumu'].map(lambda cell: type(cell) != str)
df.loc[lacks_department, 'Lise Bolumu'] = df.loc[lacks_department, 'Lise Bolum Diger']



2.10. Lise Ortalama Giris Puanı

In [34]:
def find_closest_match_with_a_threshold(text, dataframe_column, threshold=60):
    """Return the best fuzzy match for `text` among `dataframe_column`, or None.

    The candidate reported by ``process.extractOne`` is accepted only when its
    score is strictly greater than `threshold`. None is also returned when
    ``extractOne`` itself yields nothing.
    """
    best = process.extractOne(text, dataframe_column)
    if best is None:
        return None

    # Only the first two fields (match, score) are used; any extra field is ignored.
    candidate, similarity = best[:2]
    if similarity > threshold:
        return candidate
    return None

# Look up a high school's average entry score via fuzzy matching on lise_frame.
def find_the_lise_avg_point(row):
    """Return the average entry score for the high school described by `row`.

    The lookup narrows ``lise_frame`` step by step:

    1. rows whose 'İL' equals the row's 'Lise Sehir';
    2. rows whose 'OKUL ADI' is the closest fuzzy match to 'Lise Adi';
    3. when 'Lise Bolumu' is text, rows whose 'ALAN' is the closest fuzzy match
       to it.

    If step 3 finds no department, or the department has no usable score, the
    mean over the whole matched school is returned instead. The original
    fallback tested ``avg_point != None``, which is always true for the float
    returned by ``mean()``, so it never ran.

    Parameters
    ----------
    row : mapping
        Must provide 'Lise Sehir', 'Lise Adi' and 'Lise Bolumu'.

    Returns
    -------
    float
        Mean of 'ORTALAMA GIRIS PUANI' for the matched rows, or NaN when the
        city or school name is not text, no school matches, or the matched
        school has no scores.
    """
    city_name = row['Lise Sehir']
    school_name = row['Lise Adi']
    bolumu = row['Lise Bolumu']

    if not isinstance(city_name, str) or not isinstance(school_name, str):
        return np.nan

    # Restrict the candidates to schools in the same city.
    city_schools = lise_frame[lise_frame['İL'] == city_name]

    most_similar_school_name = find_closest_match_with_a_threshold(school_name, city_schools['OKUL ADI'])
    if most_similar_school_name is None:
        return np.nan

    school_rows = city_schools[city_schools['OKUL ADI'] == most_similar_school_name]

    if isinstance(bolumu, str):
        most_similar_bolumu = find_closest_match_with_a_threshold(bolumu, school_rows['ALAN'])
        if most_similar_bolumu is not None:
            department_scores = school_rows.loc[
                school_rows['ALAN'] == most_similar_bolumu, 'ORTALAMA GIRIS PUANI'
            ]
            department_avg = department_scores.mean()
            if not pd.isna(department_avg):
                return department_avg

    # No department given, none matched, or it had no score: use the school mean.
    return school_rows['ORTALAMA GIRIS PUANI'].mean()

In [35]:
# Fill the 'lise giris ort' column row by row with the looked-up entry score.
for row_label, record in df.iterrows():
    entry_score = find_the_lise_avg_point(record)
    df.at[row_label, 'lise giris ort'] = entry_score

Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '-']
Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '-']
Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '-']
Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '-']
Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '------']
Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '------']
Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '------']
Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '------']
Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '------']
Applied processor reduces input query to empty s

2.11. Lise Turu

In [36]:
def normalize_lise_types(value):
    """Collapse a free-text high-school type into one canonical label.

    Keywords are tried in a fixed order and the first one found inside `value`
    decides the label, so 'anadolu' wins over 'fen', and so on. Keywords that
    contain Turkish-specific letters are matched in both native and ASCII-folded
    spelling ('özel'/'ozel', 'düz'/'duz', 'diğer'/'diger'), because the text
    columns are ASCII-folded earlier in the notebook. The returned labels keep
    their original spelling.

    Parameters
    ----------
    value : object
        Raw 'Lise Turu' cell.

    Returns
    -------
    str or float
        The canonical label, or NaN when `value` is not text or contains no
        known keyword. Non-text input previously returned None.
    """
    if not isinstance(value, str):
        return np.nan

    # (label, accepted spellings), in priority order.
    lise_type_keywords = (
        ('anadolu', ('anadolu',)),
        ('meslek', ('meslek',)),
        ('fen', ('fen',)),
        ('özel', ('özel', 'ozel')),
        ('düz', ('düz', 'duz')),
        ('imam hatip', ('imam hatip',)),
        ('sosyal', ('sosyal',)),
        ('teknik', ('teknik',)),
        ('sanat', ('sanat',)),
        ('diğer', ('diğer', 'diger')),
        ('devlet', ('devlet',)),
    )
    for label, spellings in lise_type_keywords:
        if any(spelling in value for spelling in spellings):
            return label
    return np.nan

# Overwrite each 'Lise Turu' cell with its normalised school-type label.
for row_label, record in df.iterrows():
    school_type = normalize_lise_types(record['Lise Turu'])
    df.at[row_label, 'Lise Turu'] = school_type

2.11. Lise Mezuniyet Notu

In [37]:
def convert_to_numeric_base100(value):
    """Convert a graduation-grade entry to a single number on a 100-point scale.

    Only text is interpreted; any non-text value gives NaN. The numbers found
    in the text decide the result:

    * two numbers: their mean, rescaled from a 4.0 scale to 100 when both
      numbers are at most 4.0, otherwise returned as is;
    * one number: rescaled from 4.0 to 100 when it is at most 4.0, otherwise
      returned as is;
    * any other count: 62.5 (2.50 on a 4.0 scale) when the text contains
      "altı", NaN otherwise.
    """
    if not isinstance(value, str):
        return np.nan

    numbers = [float(token) for token in re.findall(r"\d+\.\d+|\d+", value)]

    if len(numbers) == 2:
        # The first number is treated as the upper bound, the second as the lower.
        high, low = numbers
        midpoint = (high + low) / 2
        if high <= 4.0 and low <= 4.0:
            return (midpoint / 4.0) * 100
        # Both on the 100 scale, or a mixed pair: keep the plain mean.
        return midpoint

    if len(numbers) == 1:
        grade = numbers[0]
        return (grade / 4.0) * 100 if grade <= 4.0 else grade

    if "altı" in value:
        return (2.50 / 4.0) * 100
    if "nan" in str(value) or "not ortalaması yok" in value:
        return np.nan
    return np.nan


# Overwrite each 'Lise Mezuniyet Notu' cell with its 100-scale numeric value.
for row_label, record in df.iterrows():
    df.at[row_label, 'Lise Mezuniyet Notu'] = convert_to_numeric_base100(
        record['Lise Mezuniyet Notu']
    )

2.12. Burs ile Alakalı Sütunlar

2.13. Anne Egitim Durumu and Baba Egitim Durumu

In [38]:
# 'ilkokul' passed through the same letter normalisation as the data, computed
# once so normalize_egitim_durumu can search for it in normalised text.
ilkokul = replace_turkish_letters('ilkokul')
def normalize_egitim_durumu(value):
    """Encode a parent's education level as an ordinal integer.

    Checks run in this order and the first hit wins:

    * contains 'yok' or 'nan'           -> 0
    * contains the normalised 'ilkokul' -> 1
    * contains 'ortaokul'               -> 2
    * contains 'lise'                   -> 3
    * contains 'üniversite'/'universite' -> 4
    * contains 'doktora'/'doktara'      -> 6
    * contains 'yüksek'/'yuksek'        -> 5

    The 'universite' and 'yuksek' spellings are accepted because the text
    columns are ASCII-folded earlier in the notebook; with only the native
    spellings, levels 4 and 5 could never be produced.

    Parameters
    ----------
    value : object
        Raw education-level cell.

    Returns
    -------
    int or None
        The ordinal code, or None when `value` is not text or matches none of
        the keywords.
    """
    if not isinstance(value, str):
        return None

    if 'yok' in value or 'nan' in value:
        return 0
    if str(ilkokul) in replace_turkish_letters(value):
        return 1
    if 'ortaokul' in value:
        return 2
    if 'lise' in value:
        return 3
    if 'üniversite' in value or 'universite' in value:
        return 4
    # Doctorate is tested before 'yuksek' so 'yuksek lisans / doktora' maps to 6.
    if 'doktora' in value or 'doktara' in value:
        return 6
    if 'yüksek' in value or 'yuksek' in value:
        return 5
    return None
        
            
        

# Encode both parents' education columns, one full pass over the frame per column.
for column in ('Anne Egitim Durumu', 'Baba Egitim Durumu'):
    for row_label, record in df.iterrows():
        df.at[row_label, column] = normalize_egitim_durumu(record[column])

2.14. Kardes Sayisi

In [39]:
# Coerce 'Kardes Sayisi' to whole numbers, row by row.
#  * Missing cells (NaN/None, or the text 'nan') are left untouched.
#  * Numbers and numeric text are truncated to an int; float() is applied first
#    so that text like '2.0' is accepted (int('2.0') alone raises ValueError and
#    used to be silently replaced by 1).
#  * Free text: the first run of digits found in it is used.
#  * If nothing numeric can be found, the original fallback value 1 is kept.
for index, row in df.iterrows():
    kardes_sayisi = row['Kardes Sayisi']

    if pd.isna(kardes_sayisi) or str(kardes_sayisi) == 'nan':
        continue

    try:
        parsed_count = int(float(kardes_sayisi))
    except (TypeError, ValueError, OverflowError):
        digits = re.search(r'\d+', str(kardes_sayisi))
        parsed_count = int(digits.group()) if digits else 1

    df.at[index, 'Kardes Sayisi'] = parsed_count

2.15. Sosyal Sorumluluk ve Sporla Alakalı Sütunlar

Karmaşık derecede çok datası olanlar silindi; diğerleri eğitimde fark yaratacaktır.

2.16. Deneyimler ve yetenekler

In [40]:
# Encode 'Ingilizce Seviyeniz?' as an ordinal number:
#   'ileri' -> 3, 'orta' -> 2, 'baslangic'/'başlangıç' -> 1, anything else -> 0.
# Every comparison uses the normalised text. The original compared the raw cell
# with 'başlangıç', which cannot match once the text columns have been
# ASCII-folded, so beginners were encoded as 0.
english_level_codes = {
    'ileri': 3,
    'orta': 2,
    'baslangic': 1,
    'başlangıç': 1,
}

for index, row in df.iterrows():
    seviye = row["Ingilizce Seviyeniz?"]
    if isinstance(seviye, str):
        seviye = replace_turkish_letters(seviye)
        numerical = english_level_codes.get(seviye, 0)
    else:
        # Missing or non-text cells get the lowest code.
        numerical = 0

    df.at[index, "Ingilizce Seviyeniz?"] = numerical


3. Saving new table

In [41]:
# Write the processed frame to CSV; index=False leaves the row index out of the file.
df.to_csv('processed_train_data.csv', index=False)