In [1]:
import pandas as pd
import numpy as np
import difflib
import re
from dateutil import parser
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the data while avoiding DtypeWarning by using low_memory=False
train_data = pd.read_csv('data/train.csv', low_memory=False)
test_data = pd.read_csv('data/test_x.csv', low_memory=False)

# Display the first few rows of the datasets
print(train_data.head())
print(test_data.head())

   Basvuru Yili  Degerlendirme Puani Cinsiyet  Dogum Tarihi        Dogum Yeri  \
0          2014                 52.0    Erkek      4/6/1994  Altindag, Ankara   
1          2014                 30.0    Erkek     6/11/1993           Üsküdar   
2          2014                 18.0    Erkek     1/15/1986            Samsun   
3          2014                 40.0    Erkek      6/4/1991        Diyarbakır   
4          2014                 24.0    Erkek  2 Kasim 1992   Ankara/Altındağ   

  Ikametgah Sehri                    Universite Adi Universite Turu  \
0          Ankara           İHSAN DOĞRAMACI BİLKENT            Özel   
1        İstanbul           İHSAN DOĞRAMACI BİLKENT            Özel   
2        İstanbul  ULUSLARARASI KIBRIS ÜNİVERSİTESİ            Özel   
3        İstanbul       İSTANBUL ŞEHİR ÜNİVERSİTESİ            Özel   
4          Ankara          TURGUT ÖZAL ÜNİVERSİTESİ            Özel   

   Burslu ise Burs Yuzdesi Burs Aliyor mu?  ... Spor Dalindaki Rolunuz Nedir?  \
0    

In [2]:
# Define the similar categories and their replacements for the 'Lise Turu' column
lise_turu_replacements = {
    # Group 1: 'Anadolu Lisesi'
    'Anadolu Lisesi': 'Anadolu Lisesi',
    'Anadolu lisesi': 'Anadolu Lisesi',
    'Düz lise': 'Anadolu Lisesi',
    'Düz Lise': 'Anadolu Lisesi',
    
    # Group 2: 'Özel Lise'
    'Özel': 'Özel Lise',
    'Özel Lisesi': 'Özel Lise',
    'Özel lisesi': 'Özel Lise',
    'Özel Lise': 'Özel Lise',
    
    # Group 3: 'Meslek Lisesi'
    'Meslek lisesi': 'Meslek Lisesi',
    'Meslek': 'Meslek Lisesi',
    'Meslek Lisesi': 'Meslek Lisesi',
    
    # Group 4: 'Fen Lisesi'
    'Fen Lisesi': 'Fen Lisesi',
    'Fen lisesi': 'Fen Lisesi'
}

# Replace the values in the 'Lise Turu' column of train_data
train_data['Lise Turu'] = train_data['Lise Turu'].replace(lise_turu_replacements)

# Display the updated 'Lise Turu' column to confirm the changes
print(train_data['Lise Turu'].value_counts())

Lise Turu
Anadolu Lisesi       27337
Diğer                13184
Özel Lise             7041
Meslek Lisesi         6535
Devlet                6204
Fen Lisesi            2819
İmam Hatip Lisesi     1158
Name: count, dtype: int64


In [3]:
# Define the similar categories and their replacements for the 'Lise Turu' column
lise_turu_replacements = {
    # Group 1: 'Anadolu Lisesi'
    'Anadolu Lisesi': 'Anadolu Lisesi',
    'Anadolu lisesi': 'Anadolu Lisesi',
    'Düz lise': 'Anadolu Lisesi',
    'Düz Lise': 'Anadolu Lisesi',
    
    # Group 2: 'Özel Lise'
    'Özel': 'Özel Lise',
    'Özel Lisesi': 'Özel Lise',
    'Özel lisesi': 'Özel Lise',
    'Özel Lise': 'Özel Lise',
    
    # Group 3: 'Meslek Lisesi'
    'Meslek lisesi': 'Meslek Lisesi',
    'Meslek': 'Meslek Lisesi',
    'Meslek Lisesi': 'Meslek Lisesi',
    
    # Group 4: 'Fen Lisesi'
    'Fen Lisesi': 'Fen Lisesi',
    'Fen lisesi': 'Fen Lisesi'
}

# Replace the values in the 'Lise Turu' column of train_data
train_data['Lise Turu'] = train_data['Lise Turu'].replace(lise_turu_replacements)

# Display the updated 'Lise Turu' column to confirm the changes
print(train_data['Lise Turu'].value_counts())

Lise Turu
Anadolu Lisesi       27337
Diğer                13184
Özel Lise             7041
Meslek Lisesi         6535
Devlet                6204
Fen Lisesi            2819
İmam Hatip Lisesi     1158
Name: count, dtype: int64


In [4]:
# First, normalize all values (convert to lowercase and remove spaces)
def clean_education(value):
    if pd.isna(value) or value == '0':  # If the value is NaN or '0', return None
        return None
    elif isinstance(value, str):  # If the value is a string
        value = value.lower().strip().replace(' ', '')
        # Normalize different variations of the same category
        if value in ['üniversite', 'üniversitemezunu', 'universite']:
            return 'universite'
        elif value in ['yükseklisans', 'yükseklisans/doktora', 'yükseklisans/doktara']:
            return 'yukseklisans'
        elif value in ['ilkokul', 'ilkokulmezunu', 'i̇lkokulmezunu']:
            return 'ilkokul'
        elif value in ['ortaokul', 'ortaokulmezunu', 'ortaokul']:
            return 'ortaokul'
        elif value in ['lise', 'lisemezunu', 'lise']:
            return 'lise'
        elif value in ['doktora', 'doktara']:
            return 'doktora'
        elif value in ['eğitimyok', 'egitimiyok', 'eğitimyok']:
            return 'egitimyok'
    return value  # Return the value as it is if it's already clean

# Define the encoding based on the cleaned categories
education_mapping = {
    'ilkokul': 1,
    'ortaokul': 2,
    'lise': 3,
    'universite': 4,
    'yukseklisans': 5,
    'doktora': 6,
    'egitimyok': 0
}

# Apply the cleaning function first, and then map the cleaned values to numeric for train data
train_data['Anne Egitim Durumu'] = train_data['Anne Egitim Durumu'].apply(clean_education).map(education_mapping)
train_data['Baba Egitim Durumu'] = train_data['Baba Egitim Durumu'].apply(clean_education).map(education_mapping)

# Apply the cleaning function first, and then map the cleaned values to numeric for test data
test_data['Anne Egitim Durumu'] = test_data['Anne Egitim Durumu'].apply(clean_education).map(education_mapping)
test_data['Baba Egitim Durumu'] = test_data['Baba Egitim Durumu'].apply(clean_education).map(education_mapping)

In [5]:
# Define a function to encode 'Lise Mezuniyet Notu' combining GPA and percentage ranges
def encode_highschool_graduation_score(value):
    if pd.isna(value) or value == 'Not ortalaması yok':  # Handle NaN and 'Not ortalaması yok'
        return None
    value = value.strip().lower().replace(' ', '')  # Normalize the value by removing spaces and lowercasing
    
    # Highest Range: GPA and percentage ranges
    if value in ['3.00-4.00', '4.00-3.50', '100-85', '75-100']:
        return 4  # Highest range
    
    # Upper-Mid Range: GPA and percentage ranges
    elif value in ['3.50-3.00', '3.50-3', '84-70', '50-75']:
        return 3  # Upper-mid range
    
    # Mid Range: GPA and percentage ranges
    elif value in ['3.00-2.50', '69-55', '50-74', '25-50', '25-49']:
        return 2  # Mid range
    
    # Lowest Range: GPA and percentage ranges
    elif value in ['2.50vealtı', '54-45', '44-0', '0-25', '0-24']:
        return 1  # Lowest range
    
    return None  # For unexpected values

# Apply the encoding function to the 'Lise Mezuniyet Notu' column for train data
train_data['Lise Mezuniyet Notu'] = train_data['Lise Mezuniyet Notu'].apply(encode_highschool_graduation_score)

# Apply the encoding function to the 'Lise Mezuniyet Notu' column for test data
test_data['Lise Mezuniyet Notu'] = test_data['Lise Mezuniyet Notu'].apply(encode_highschool_graduation_score)

In [6]:
# List of 81 official Turkish cities
known_cities = [
    'adana', 'adiyaman', 'afyonkarahisar', 'agri', 'aksaray', 'amasya', 'ankara', 'antalya', 'ardahan', 
    'artvin', 'aydin', 'balikesir', 'bartin', 'batman', 'bayburt', 'bilecik', 'bingol', 'bitlis', 
    'bolu', 'burdur', 'bursa', 'canakkale', 'cankiri', 'corum', 'denizli', 'diyarbakir', 'duzce', 
    'edirne', 'elazig', 'erzincan', 'erzurum', 'eskisehir', 'gaziantep', 'giresun', 'gumushane', 
    'hakkari', 'hatay', 'igdir', 'isparta', 'istanbul', 'izmir', 'kahramanmaras', 'karabuk', 
    'karaman', 'kars', 'kastamonu', 'kayseri', 'kilis', 'kirikkale', 'kirklareli', 'kirsehir', 
    'kocaeli', 'konya', 'kutahya', 'malatya', 'manisa', 'mardin', 'mersin', 'mugla', 'mus', 
    'nevsehir', 'nigde', 'ordu', 'osmaniye', 'rize', 'sakarya', 'samsun', 'sanliurfa', 'siirt', 
    'sinop', 'sirnak', 'sivas', 'tekirdag', 'tokat', 'trabzon', 'tunceli', 'usak', 'van', 
    'yalova', 'yozgat', 'zonguldak'
]

# Function to replace Turkish characters with their English equivalents
def replace_turkish_chars(city):
    if pd.isna(city):  # NaN check
        return city
    city = city.lower().strip()
    
    # Replace Turkish characters
    turkish_to_english = str.maketrans({
        'ç': 'c', 'ş': 's', 'ı': 'i', 'ö': 'o', 'ü': 'u', 'ğ': 'g', 'İ': 'i',
        'Ç': 'c', 'Ş': 's', 'Ö': 'o', 'Ü': 'u', 'Ğ': 'g'
    })
    return city.translate(turkish_to_english)

# Function to find the closest match to known cities
def get_closest_city(city):
    if isinstance(city, str):  # If the input is a string
        # Normalize the city and find the closest match
        closest_match = difflib.get_close_matches(city, known_cities, n=1, cutoff=0.6)
        return closest_match[0] if closest_match else None
    return city

# Apply city normalization and matching to the 'Ikametgah Sehri' column in the DataFrame
def process_city_data(df, city_column):
    # Replace Turkish characters
    df[city_column] = df[city_column].apply(replace_turkish_chars)
    
    # Match with the closest known city
    df[city_column] = df[city_column].apply(get_closest_city)
    
    # Handle NaN and None values the same way
    df[city_column] = df[city_column].replace([None], np.nan)
    
    # Get the unique matched cities
    unique_cities = df[city_column].unique()

    # Sort non-NaN (non-float) values
    cities_without_nan = [city for city in unique_cities if isinstance(city, str)]
    
    # Number cities (NaN will be 82)
    city_mapping = {city: idx for idx, city in enumerate(sorted(cities_without_nan), start=1)}
    nan_number = len(known_cities) + 1  # Assign NaN as 82
    
    # Add a new column without modifying the original
    df['Ikametgah Sehri Encoded'] = df[city_column].map(city_mapping)
    
    # Assign 82 to NaN values
    df['Ikametgah Sehri Encoded'] = df['Ikametgah Sehri Encoded'].fillna(nan_number).astype(int)
    
    return df, unique_cities

# Process the data (for example, using a DataFrame named 'train_data')
train_data, unique_cities_after_processing = process_city_data(train_data, 'Ikametgah Sehri')

# Process the data (for example, using a DataFrame named 'test_data')
test_data, unique_cities_after_processing = process_city_data(test_data, 'Ikametgah Sehri')

In [7]:
# Sample data preparation (assuming you have loaded your data already)
# train_data = pd.read_csv('path_to_train_data.csv')
# test_data = pd.read_csv('path_to_test_data.csv')

# Specify the important columns to keep
important_columns_train = [
    'Degerlendirme Puani', 'Cinsiyet', 'Dogum Tarihi', 'Ikametgah Sehri', 'Universite Adi', 'Universite Turu', 
    'Burs Aliyor mu?', 'Universite Kacinci Sinif', 'Universite Not Ortalamasi', 
    'Lise Turu', 'Lise Mezuniyet Notu', 'Baska Bir Kurumdan Burs Aliyor mu?', 'Anne Egitim Durumu', 'Anne Calisma Durumu', 
    'Baba Egitim Durumu', 'Baba Calisma Durumu', 'Kardes Sayisi', 
    'Girisimcilik Kulupleri Tarzi Bir Kulube Uye misiniz?', 
    'Profesyonel Bir Spor Daliyla Mesgul musunuz?', 'Aktif olarak bir STK üyesi misiniz?', 
    'Stk Projesine Katildiniz Mi?', 'Girisimcilikle Ilgili Deneyiminiz Var Mi?', 
    'Ingilizce Biliyor musunuz?', 'Ingilizce Seviyeniz?'
]

important_columns_test = [
    'id',
    'Cinsiyet', 'Dogum Tarihi', 'Ikametgah Sehri', 'Universite Adi', 'Universite Turu', 
    'Burs Aliyor mu?', 'Universite Kacinci Sinif', 'Universite Not Ortalamasi', 
    'Lise Turu', 'Lise Mezuniyet Notu', 'Baska Bir Kurumdan Burs Aliyor mu?', 'Anne Egitim Durumu', 'Anne Calisma Durumu', 
    'Baba Egitim Durumu', 'Baba Calisma Durumu', 'Kardes Sayisi', 
    'Girisimcilik Kulupleri Tarzi Bir Kulube Uye misiniz?', 
    'Profesyonel Bir Spor Daliyla Mesgul musunuz?', 'Aktif olarak bir STK üyesi misiniz?', 
    'Stk Projesine Katildiniz Mi?', 'Girisimcilikle Ilgili Deneyiminiz Var Mi?', 
    'Ingilizce Biliyor musunuz?', 'Ingilizce Seviyeniz?'
]

# Drop all columns except the important ones in both train and test datasets
train_data_filtered = train_data[important_columns_train].copy()
test_data_filtered = test_data[important_columns_test].copy()

# Define a function to clean and parse dates
def clean_date(date_series):
    # Dictionary for normalizing Turkish month names
    month_map = {
        'Ocak': '01', 'Şubat': '02', 'Mart': '03', 'Nisan': '04',
        'Mayıs': '05', 'Haziran': '06', 'Temmuz': '07', 'Ağustos': '08',
        'Eylül': '09', 'Ekim': '10', 'Kasım': '11', 'Aralık': '12'
    }

    def translate_and_clean(date_str):
        if pd.isna(date_str):
            return None

        # Normalize special characters and replace month names
        date_str = date_str.replace('Ş', 'S').replace('ı', 'i').replace('ç', 'c').replace('ğ', 'g').replace('ü', 'u').replace('ö', 'o')
        
        for turkish_month, numeric_month in month_map.items():
            if turkish_month in date_str:
                date_str = date_str.replace(turkish_month, numeric_month)

        # Replace common separators with dashes
        date_str = date_str.replace('/', '-').replace('.', '-').replace(',', '-')

        # Attempt to parse the date using a list of common formats
        formats = [
            "%d-%m-%Y", "%d-%m-%y", "%Y-%m-%d", "%Y/%m/%d", "%d/%m/%Y",
            "%d-%b-%Y", "%d-%b-%y", "%b-%d-%Y", "%b-%d-%y", "%d %B %Y", "%d %B %y",
            "%B %d, %Y", "%d %B %Y"
        ]
        
        for fmt in formats:
            try:
                return parser.parse(date_str, fuzzy=True).year
            except (ValueError, TypeError):
                continue
        
        return None

    return date_series.apply(translate_and_clean)

# Clean 'Dogum Tarihi' using the custom function
train_data_filtered['Dogum Tarihi'] = clean_date(train_data_filtered['Dogum Tarihi'])
test_data_filtered['Dogum Tarihi'] = clean_date(test_data_filtered['Dogum Tarihi'])

# Check for missing values and understand data loss
print("Train Data Missing 'Dogum Tarihi':", train_data_filtered['Dogum Tarihi'].isna().sum())
print("Test Data Missing 'Dogum Tarihi':", test_data_filtered['Dogum Tarihi'].isna().sum())

# Display the filtered and cleaned data
print(train_data_filtered.head())
print(test_data_filtered.head())

# Display the rows with missing 'Dogum Tarihi' for further inspection
missing_dates_train = train_data_filtered[train_data_filtered['Dogum Tarihi'].isna()]
missing_dates_test = test_data_filtered[test_data_filtered['Dogum Tarihi'].isna()]

print("Missing 'Dogum Tarihi' in Train Data:")
print(missing_dates_train.head(10))  # Displaying first 10 for inspection

print("Missing 'Dogum Tarihi' in Test Data:")
print(missing_dates_test.head(10))  # Displaying first 10 for inspection

Train Data Missing 'Dogum Tarihi': 1819
Test Data Missing 'Dogum Tarihi': 0
   Degerlendirme Puani Cinsiyet  Dogum Tarihi Ikametgah Sehri  \
0                 52.0    Erkek        1994.0          ankara   
1                 30.0    Erkek        1993.0        istanbul   
2                 18.0    Erkek        1986.0        istanbul   
3                 40.0    Erkek        1991.0        istanbul   
4                 24.0    Erkek        1992.0          ankara   

                     Universite Adi Universite Turu Burs Aliyor mu?  \
0           İHSAN DOĞRAMACI BİLKENT            Özel            Evet   
1           İHSAN DOĞRAMACI BİLKENT            Özel           Hayır   
2  ULUSLARARASI KIBRIS ÜNİVERSİTESİ            Özel           Hayır   
3       İSTANBUL ŞEHİR ÜNİVERSİTESİ            Özel            Evet   
4          TURGUT ÖZAL ÜNİVERSİTESİ            Özel            Evet   

  Universite Kacinci Sinif Universite Not Ortalamasi       Lise Turu  ...  \
0                        3   

In [8]:
# Step 1: Ensure 'Dogum Tarihi' is in the dataset and drop rows where it is missing
if 'Dogum Tarihi' in train_data_filtered.columns:
    train_data_filtered = train_data_filtered.dropna(subset=['Dogum Tarihi'])

# Step 2: Calculate 'Yas' (age) based on 'Dogum Tarihi' and drop 'Dogum Tarihi' if it exists
if 'Dogum Tarihi' in train_data_filtered.columns:
    current_year = pd.Timestamp.now().year
    train_data_filtered['Yas'] = current_year - train_data_filtered['Dogum Tarihi']
    train_data_filtered = train_data_filtered.drop(columns=['Dogum Tarihi'])

if 'Dogum Tarihi' in test_data_filtered.columns:
    test_data_filtered['Yas'] = current_year - test_data_filtered['Dogum Tarihi']
    test_data_filtered = test_data_filtered.drop(columns=['Dogum Tarihi'])

# Step 3: Fill missing values in numerical columns for train dataset (including excluded ones)
numerical_columns_train = train_data_filtered.select_dtypes(include=['number']).columns

# Exclude the columns you don't want to encode as numerical
exclude_from_encoding = ['Kardes Sayisi', 'Baska Kurumdan Aldigi Burs Miktari', 'Universite Kacinci Sinif']

# Fill missing values for numerical columns in training set
train_data_filtered[numerical_columns_train] = train_data_filtered[numerical_columns_train].fillna(train_data_filtered[numerical_columns_train].median())

# For the test dataset, exclude 'Degerlendirme Puani' and fill only the existing numerical columns
numerical_columns_test = test_data_filtered.select_dtypes(include=['number']).columns
test_data_filtered[numerical_columns_test] = test_data_filtered[numerical_columns_test].fillna(test_data_filtered[numerical_columns_test].median())

# Step 4: Fill missing values in categorical columns before encoding
categorical_columns = train_data_filtered.select_dtypes(include=['object']).columns

# Exclude 'Kardes Sayisi', 'Baska Kurumdan Aldigi Burs Miktari', 'Universite Kacinci Sinif' from encoding
categorical_columns = [col for col in categorical_columns if col not in exclude_from_encoding]

train_data_filtered[categorical_columns] = train_data_filtered[categorical_columns].fillna('Unknown')
test_data_filtered[categorical_columns] = test_data_filtered[categorical_columns].fillna('Unknown')

# Step 5: Encode only categorical variables using LabelEncoder, handle unseen labels in the test set
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    train_data_filtered[col] = le.fit_transform(train_data_filtered[col].astype(str))
    
    # Handle unseen labels in test set: if label not seen in training, assign -1
    test_data_filtered[col] = test_data_filtered[col].apply(lambda x: le.transform([x])[0] if x in le.classes_ else -1)

    # Save the encoder for future use if needed
    label_encoders[col] = le

# Display the final cleaned and encoded datasets
print(train_data_filtered.head())
print(test_data_filtered.head())

   Degerlendirme Puani  Cinsiyet  Ikametgah Sehri  Universite Adi  \
0                 52.0         2                7             439   
1                 30.0         2               40             439   
2                 18.0         2               40             387   
3                 40.0         2               40             471   
4                 24.0         2                7             371   

   Universite Turu  Burs Aliyor mu? Universite Kacinci Sinif  \
0                4                1                        3   
1                4                2                        3   
2                4                2                        1   
3                4                1                        3   
4                4                1                      NaN   

   Universite Not Ortalamasi  Lise Turu  Lise Mezuniyet Notu  ...  \
0                         13          0                  3.0  ...   
1                         11          0                  4.0  

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Assuming 'Degerlendirme Puani' is the target variable for regression
target_column = 'Degerlendirme Puani'

# Step 1: Split the data into features (X) and target (y)
X = train_data_filtered.drop(columns=[target_column])
y = train_data_filtered[target_column]

# Ensure that all categorical variables are encoded
categorical_columns = X.select_dtypes(include=['object']).columns

label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))  # Encoding categorical variables
    label_encoders[col] = le  # Save encoder for possible future use

# Step 2: Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Train a Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Step 4: Predict on the validation set
y_pred = model.predict(X_val)

# Step 5: Evaluate the model
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_val, y_pred)

# Display the evaluation results
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R²): {r2}")

# Optional: Make predictions on the test set
test_data_filtered_encoded = test_data_filtered.copy()

# Encode the categorical columns in the test set using the same encoders
for col in categorical_columns:
    test_data_filtered_encoded[col] = test_data_filtered_encoded[col].apply(lambda x: label_encoders[col].transform([x])[0] if x in label_encoders[col].classes_ else -1)

# Ensure that the 'id' column is dropped before making predictions
if 'id' in test_data_filtered_encoded.columns:
    test_data_filtered_encoded = test_data_filtered_encoded.drop(columns=['id'])

# Make predictions on the test set
test_predictions = model.predict(test_data_filtered_encoded)
print("Test Predictions:", test_predictions[:5])

# Assuming the 'id' column is present in the original test data
# Create a DataFrame with the id and predicted scores
results = pd.DataFrame({
    'id': test_data_filtered['id'],  # Use the 'id' column from the original test data
    'Predicted_Puani': test_predictions
})

# Save the results to a CSV file
results.to_csv('predicted_scores.csv', index=False)

print("Prediction results saved to 'predicted_scores.csv'.")


Mean Squared Error (MSE): 120.97204789427285
Root Mean Squared Error (RMSE): 10.998729376354017
R-squared (R²): 0.6430762408881441
Test Predictions: [24.54536097 25.78828387 18.15293354 20.07890602 26.70186848]
Prediction results saved to 'predicted_scores.csv'.


In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Step 3: Train a new model (Random Forest)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Step 4: Predict on the validation set using the new model
y_pred_rf = rf_model.predict(X_val)

# Step 5: Evaluate the new model
mse_rf = mean_squared_error(y_val, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_val, y_pred_rf)

# Display the evaluation results for the new model
print(f"Random Forest Mean Squared Error (MSE): {mse_rf}")
print(f"Random Forest Root Mean Squared Error (RMSE): {rmse_rf}")
print(f"Random Forest R-squared (R²): {r2_rf}")

# Optional: Make predictions on the test set using the new model
test_predictions_rf = rf_model.predict(test_data_filtered_encoded)
print("Test Predictions (Random Forest):", test_predictions_rf[:5])

# Create a DataFrame with the 'id' and predicted scores for the new model
results_rf = pd.DataFrame({
    'id': test_data_filtered['id'],  # Use the 'id' column from the original test data
    'Predicted_Puani': test_predictions_rf
})

# Save the results to a CSV file for the new model
results_rf.to_csv('predicted_scores_random_forest.csv', index=False)

print("Random Forest prediction results saved to 'predicted_scores_random_forest.csv'.")


Random Forest Mean Squared Error (MSE): 43.32048098872683
Random Forest Root Mean Squared Error (RMSE): 6.5818296079985865
Random Forest R-squared (R²): 0.8721844492990345
Test Predictions (Random Forest): [33.96 18.76 13.5  18.36 40.16]
Random Forest prediction results saved to 'predicted_scores_random_forest.csv'.
