In [124]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import StratifiedKFold
from scipy.stats import randint as sp_randint
from scipy.stats import uniform

from xgboost import XGBClassifier


# Read the raw CSV files; `train` / `test` keep the untouched originals.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# All later edits happen on copies so the raw frames stay available.
df_train, df_test = train.copy(), test.copy()

# Preview the first rows of the working training frame.
print(df_train.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [125]:
# Data preparation
def extract_title(name):
    """Return the honorific (e.g. ``"Mr"``, ``"Miss"``) embedded in a passenger name.

    The name is expected to look like ``"Surname, Title. Given names"``: the
    title is the text between the first comma and the next period, stripped of
    surrounding whitespace.

    Parameters
    ----------
    name : str or missing value
        Raw passenger name.

    Returns
    -------
    str
        The extracted title, or ``"Unknown"`` when ``name`` is missing, is not
        a string, contains no comma, or yields an empty title.
    """
    # Covers None / NaN / pd.NA as well as any other non-string value.
    if not isinstance(name, str):
        return "Unknown"

    parts = name.split(',')
    # Without a comma there is no "after the surname" segment to read from.
    if len(parts) < 2:
        return "Unknown"

    title = parts[1].split('.')[0].strip()
    return title or "Unknown"

# Bin definitions shared by both frames.
_age_edges = [0, 12, 18, 35, 60, 120]
_age_names = ['Child', 'Teen', 'YoungAdult', 'Adult', 'Senior']
_fare_names = ['Low', 'Medium', 'High', 'VeryHigh']

# Add the same engineered columns to the train and test frames (in place).
for _frame in (df_train, df_test):
    # Honorific parsed from the passenger name.
    _frame['Title'] = _frame['Name'].apply(extract_title)

    # Siblings/spouses + parents/children + the passenger.
    _frame['FamilySize'] = _frame['SibSp'] + _frame['Parch'] + 1
    _frame['IsAlone'] = (_frame['FamilySize'] == 1).astype(int)
    _frame['FarePerPerson'] = _frame['Fare'] / _frame['FamilySize']

    # First character of the cabin string, or 'Unknown' when the cabin is missing.
    _frame['Deck'] = _frame['Cabin'].map(
        lambda cabin: 'Unknown' if pd.isnull(cabin) else str(cabin)[0]
    )

    # Fixed-edge age groups; quartile-based fare groups computed per frame.
    _frame['AgeBin'] = pd.cut(_frame['Age'], bins=_age_edges, labels=_age_names)
    _frame['FareBin'] = pd.qcut(_frame['Fare'], 4, labels=_fare_names)

    # Interaction of passenger class and sex, e.g. "3_male".
    _frame['Pclass_Sex'] = _frame['Pclass'].astype(str) + "_" + _frame['Sex']

_preview_columns = ['Title', 'FamilySize', 'IsAlone', 'FarePerPerson',
                    'Deck', 'AgeBin', 'FareBin', 'Pclass_Sex']
print(df_train[_preview_columns].head())

  Title  FamilySize  IsAlone  FarePerPerson     Deck      AgeBin   FareBin  \
0    Mr           2        0        3.62500  Unknown  YoungAdult       Low   
1   Mrs           2        0       35.64165        C       Adult  VeryHigh   
2  Miss           1        1        7.92500  Unknown  YoungAdult    Medium   
3   Mrs           2        0       26.55000        C  YoungAdult  VeryHigh   
4    Mr           1        1        8.05000  Unknown  YoungAdult    Medium   

  Pclass_Sex  
0     3_male  
1   1_female  
2   3_female  
3   1_female  
4     3_male  


In [126]:
# Print, for each frame, only the columns that contain at least one missing value.
for _heading, _frame in (
    ("Train Set Eksik Değerler:", df_train),
    ("\nTest Set Eksik Değerler:", df_test),
):
    _null_counts = _frame.isnull().sum()
    print(_heading)
    print(_null_counts[_null_counts > 0])

Train Set Eksik Değerler:
Age         177
Cabin       687
Embarked      2
AgeBin      177
dtype: int64

Test Set Eksik Değerler:
Age               86
Fare               1
Cabin            327
FarePerPerson      1
AgeBin            86
FareBin            1
dtype: int64


In [127]:
# Replace missing embarkation ports with the explicit category "Missing" in both frames.
for _frame in (df_train, df_test):
    _frame['Embarked'] = _frame['Embarked'].fillna("Missing")

In [128]:
# Fill missing test-set fares with the median fare of the test set itself.
# FarePerPerson and FareBin were derived from Fare in an earlier cell and are not refreshed here.
df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].median())

In [129]:
# Median age per title, learned from the training set only.
age_fill_map = df_train.groupby('Title')['Age'].median().to_dict()


def _fill_age_by_title(frame, title_medians):
    """Return ``frame['Age']`` with missing values imputed.

    Missing ages are filled with the median age of the passenger's title
    (``title_medians``). Titles absent from the mapping, or whose median is
    itself NaN, fall back to the median age of ``frame``.
    """
    # Computed once, before any filling, instead of once per missing row.
    fallback_age = frame['Age'].median()
    per_title_age = frame['Title'].map(title_medians)
    return frame['Age'].fillna(per_title_age).fillna(fallback_age)


df_train['Age'] = _fill_age_by_title(df_train, age_fill_map)
df_test['Age'] = _fill_age_by_title(df_test, age_fill_map)

In [130]:
# Recompute the binned columns now that Age and Fare have been imputed.
_age_edges = [0, 12, 18, 35, 60, 120]
_age_names = ['Child', 'Teen', 'YoungAdult', 'Adult', 'Senior']
df_train['AgeBin'] = pd.cut(df_train['Age'], bins=_age_edges, labels=_age_names)
df_test['AgeBin'] = pd.cut(df_test['Age'], bins=_age_edges, labels=_age_names)

# Learn the fare quartile edges on the training set and reuse them for the
# test set, so that e.g. 'Low' covers the same fare range in both frames.
_fare_names = ['Low', 'Medium', 'High', 'VeryHigh']
df_train['FareBin'], _fare_edges = pd.qcut(
    df_train['Fare'], 4, labels=_fare_names, retbins=True
)
# Open the outer edges so test fares outside the training range still get a bin.
_fare_edges[0], _fare_edges[-1] = -np.inf, np.inf
df_test['FareBin'] = pd.cut(df_test['Fare'], bins=_fare_edges, labels=_fare_names)

In [131]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class ColumnTransformerToDF(BaseEstimator, TransformerMixin):
    """Wrap a transformer so that ``transform`` returns a labelled DataFrame.

    Parameters
    ----------
    transformer : estimator with ``fit`` / ``transform``
        The wrapped transformer; it is fitted in place by :meth:`fit`.
    columns : list of str
        Column names for the transformed output. Their order must match the
        column order the wrapped transformer produces.
    """

    def __init__(self, transformer, columns):
        # Parameters are stored as given, without modification.
        self.transformer = transformer
        self.columns = columns

    def fit(self, X, y=None):
        """Fit the wrapped transformer on ``X`` (and ``y``) and return ``self``."""
        self.transformer.fit(X, y)
        return self

    def transform(self, X):
        """Transform ``X`` and return the result as a DataFrame.

        The row index of ``X`` is kept when ``X`` is a pandas object; for any
        other input (e.g. a NumPy array) a default index is used.
        """
        X_transformed = self.transformer.transform(X)
        # A sparse result has to be densified before it can be labelled with column names.
        if hasattr(X_transformed, "toarray"):
            X_transformed = X_transformed.toarray()
        row_index = X.index if isinstance(X, (pd.DataFrame, pd.Series)) else None
        return pd.DataFrame(X_transformed, columns=self.columns, index=row_index)

# Feature groups used by the preprocessing pipeline.
categorical_cols = ['Sex', 'Embarked', 'Title', 'Deck', 'AgeBin', 'FareBin', 'Pclass_Sex']
numerical_cols = ['Age', 'Fare', 'FamilySize']
# --- Pipeline steps ---
# Imputer for the numerical columns (fills with the mean)
num_transformer = SimpleImputer(strategy='mean')

# Imputer + encoder for the categorical columns
# Categories not seen during fit are encoded as -1.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

preprocessor_raw = ColumnTransformer(transformers=[
    ('num', num_transformer, numerical_cols),
    ('cat', cat_transformer, categorical_cols)
])

# Same order as the transformers above (numerical first, then categorical);
# used as the column labels of the transformed output.
all_features = numerical_cols + categorical_cols

preprocessor = ColumnTransformerToDF(preprocessor_raw, all_features)


# Model pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Split into features and target
X_train = df_train[categorical_cols + numerical_cols]
y_train = df_train['Survived']

# Accuracy measured with stratified, shuffled 5-fold cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')

print(f'Cross-Validation Accuracy Scores: {scores}')
print(f'Average CV Accuracy: {scores.mean():.4f}')

Cross-Validation Accuracy Scores: [0.7877095  0.79775281 0.76966292 0.79775281 0.8258427 ]
Average CV Accuracy: 0.7957


In [132]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMClassifier

# --- Base models ---
# Every base model is a pipeline that starts with the shared `preprocessor` step.
logreg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

lgbm = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', lgb.LGBMClassifier(n_estimators=100, learning_rate=0.05, random_state=42, verbose=-1))
])

# --- Ensemble: VotingClassifier (soft voting) ---
voting_clf = VotingClassifier(estimators=[
    ('lr', logreg),
    ('rf', rf),
    ('lgb', lgbm)
], voting='soft')  # voting='hard' is also possible

# --- Cross-validation ---
X_train = df_train[categorical_cols + numerical_cols]
y_train = df_train['Survived']

# Same shuffled, stratified 5-fold splitter (seed 42) as the baseline model cell.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(voting_clf, X_train, y_train, cv=cv, scoring='accuracy')

print(f'VotingClassifier CV Accuracy Scores: {scores}')
print(f'VotingClassifier Average Accuracy: {scores.mean():.4f}')

VotingClassifier CV Accuracy Scores: [0.84916201 0.85393258 0.79775281 0.83707865 0.84269663]
VotingClassifier Average Accuracy: 0.8361


In [133]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMClassifier

# NOTE(review): this cell rebuilds the same soft-voting ensemble as the cell directly above it.

def _with_preprocessing(classifier):
    """Chain the shared `preprocessor` step with the given classifier."""
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier)
    ])


# --- Base models ---
logreg = _with_preprocessing(LogisticRegression(max_iter=1000, random_state=42))
rf = _with_preprocessing(RandomForestClassifier(n_estimators=100, random_state=42))
lgbm = _with_preprocessing(
    lgb.LGBMClassifier(n_estimators=100, learning_rate=0.05, random_state=42, verbose=-1)
)

# --- Ensemble: soft voting over the three base pipelines ---
_ensemble_members = [('lr', logreg), ('rf', rf), ('lgb', lgbm)]
voting_clf = VotingClassifier(estimators=_ensemble_members, voting='soft')

# --- Cross-validation ---
_feature_columns = categorical_cols + numerical_cols
X_train = df_train[_feature_columns]
y_train = df_train['Survived']

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(voting_clf, X_train, y_train, cv=cv, scoring='accuracy')

print(f'VotingClassifier CV Accuracy Scores: {scores}')
print(f'VotingClassifier Average Accuracy: {scores.mean():.4f}')

VotingClassifier CV Accuracy Scores: [0.84916201 0.85393258 0.79775281 0.83707865 0.84269663]
VotingClassifier Average Accuracy: 0.8361


In [134]:
# Train the model on the full training set
voting_clf.fit(X_train, y_train)

# Preprocess + predict on the test data (preprocessing happens inside each pipeline)
X_test = df_test[categorical_cols + numerical_cols]
predictions = voting_clf.predict(X_test)

# Example: build the submission file
submission = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Survived': predictions
})

submission.to_csv('submission_voting.csv', index=False)
print("Tahmin dosyası oluşturuldu: submission_voting.csv")


Tahmin dosyası oluşturuldu: submission_voting.csv


In [135]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Base models (bare estimators here; preprocessing is applied once, in the outer pipeline)
base_learners = [
    ('lr', LogisticRegression(max_iter=1000, random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('lgb', LGBMClassifier(n_estimators=100, learning_rate=0.05, random_state=42, verbose=-1))
]

# Meta model
meta_learner = LogisticRegression(max_iter=1000, random_state=42)

# StackingClassifier
stacked_model = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=5,
    n_jobs=-1,
    passthrough=True  # Also passes the base models' original inputs to the meta model
)

# Pipeline including preprocessing
stacking_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('stacked_model', stacked_model)
])

# Evaluate with cross-validation
# NOTE(review): this passes cv=5, whereas the earlier cells pass the shuffled
# StratifiedKFold object `cv`; the folds differ, so scores are not strictly comparable.
scores = cross_val_score(stacking_pipeline, X_train, y_train, cv=5, scoring='accuracy')

print(f"StackingClassifier CV Scores: {scores}")
print(f"StackingClassifier Mean Accuracy: {scores.mean():.4f}")


StackingClassifier CV Scores: [0.83240223 0.82022472 0.84269663 0.8258427  0.87640449]
StackingClassifier Mean Accuracy: 0.8395


In [136]:
# Fit on the full training set and predict the test set
stacking_pipeline.fit(X_train, y_train)
predictions_stack = stacking_pipeline.predict(X_test)

# Create the submission file
submission_stack = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Survived': predictions_stack
})

submission_stack.to_csv("submission_stacking.csv", index=False)
print("Tahmin dosyası oluşturuldu: submission_stacking.csv")

Tahmin dosyası oluşturuldu: submission_stacking.csv


In [137]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform

In [138]:
!pip install scipy



In [139]:
# Search space for the LightGBM randomized search.
# scipy's randint(low, high) samples integers in [low, high);
# uniform(loc, scale) samples floats in [loc, loc + scale].
param_dist = {
    'n_estimators': sp_randint(50, 500),
    'learning_rate': uniform(0.01, 0.2),  # 0.01 .. 0.21
    'num_leaves': sp_randint(20, 150),
    'max_depth': sp_randint(3, 15),
    'min_child_samples': sp_randint(5, 100),
    # NOTE(review): LightGBM presumably only applies row subsampling when
    # subsample_freq > 0, which is not set anywhere here — confirm against the LightGBM docs.
    'subsample': uniform(0.6, 0.4),  # 0.6 .. 1.0
    'colsample_bytree': uniform(0.6, 0.4)  # 0.6 .. 1.0
}

In [140]:
# Model
# NOTE(review): `verbose` and `verbosity` are both passed with the same value;
# presumably they are aliases and one of them is redundant — confirm.
lgbm_clf = LGBMClassifier(
    random_state=42,
    verbose=-1,
    verbosity=-1,
    force_row_wise=True
)


# Pipeline + model tuning
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', lgbm_clf)
])

# Randomized search
# The 'classifier__' prefix routes each sampled parameter to the pipeline's 'classifier' step.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions={
        'classifier__' + k: v for k, v in param_dist.items()
    },
    n_iter=50,
    scoring='accuracy',
    cv=5,
    random_state=42,
    verbose=1,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

print("En iyi skor:", random_search.best_score_)
print("En iyi parametreler:", random_search.best_params_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
En iyi skor: 0.8507501098487227
En iyi parametreler: {'classifier__colsample_bytree': np.float64(0.7380284992106732), 'classifier__learning_rate': np.float64(0.13687026894027277), 'classifier__max_depth': 4, 'classifier__min_child_samples': 57, 'classifier__n_estimators': 221, 'classifier__num_leaves': 87, 'classifier__subsample': np.float64(0.632341330533086)}


In [141]:
# Predict the test set with the search object
preds = random_search.predict(X_test)

# Rebinds `submission`, which earlier held the voting-ensemble predictions.
submission = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Survived': preds
})

submission.to_csv("submission_tuned_lgbm.csv", index=False)
print("Tahmin dosyası hazır: submission_tuned_lgbm.csv")

Tahmin dosyası hazır: submission_tuned_lgbm.csv


In [142]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

In [145]:
# Models
rf_clf = RandomForestClassifier(random_state=42)
lr_clf = LogisticRegression(max_iter=1000, random_state=42)
xgb_clf = XGBClassifier(eval_metric='logloss', random_state=42)

# The preprocessor inside the pipelines stays exactly the same

# Define one pipeline per model
pipeline_rf = Pipeline([('preprocessor', preprocessor), ('classifier', rf_clf)])
pipeline_lr = Pipeline([('preprocessor', preprocessor), ('classifier', lr_clf)])
pipeline_xgb = Pipeline([('preprocessor', preprocessor), ('classifier', xgb_clf)])

# LightGBM zaten random_search içinde pipeline ile hazır

In [146]:
# Candidate models to compare; the tuned LightGBM pipeline comes from the randomized search.
models = {
    'LightGBM (Tuned)': random_search.best_estimator_,
    'RandomForest': pipeline_rf,
    'LogisticRegression': pipeline_lr,
    'XGBoost': pipeline_xgb
}

# 5-fold cross-validated accuracy for every candidate.
# Note: the loop variable `model` rebinds the baseline pipeline named `model` from an earlier cell.
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
    print(f"{name} CV Accuracy Scores: {scores}")
    print(f"{name} Average Accuracy: {scores.mean():.4f}\n")

LightGBM (Tuned) CV Accuracy Scores: [0.83240223 0.83707865 0.87640449 0.83707865 0.87078652]
LightGBM (Tuned) Average Accuracy: 0.8508

RandomForest CV Accuracy Scores: [0.79888268 0.80898876 0.85393258 0.75280899 0.83707865]
RandomForest Average Accuracy: 0.8103

LogisticRegression CV Accuracy Scores: [0.78212291 0.76966292 0.79213483 0.79775281 0.79775281]
LogisticRegression Average Accuracy: 0.7879

XGBoost CV Accuracy Scores: [0.82122905 0.80898876 0.85955056 0.80337079 0.83707865]
XGBoost Average Accuracy: 0.8260



In [147]:
# Soft-voting ensemble of the tuned LightGBM pipeline and the three other pipelines.
# Rebinds `voting_clf`, replacing the three-model ensemble defined earlier.
voting_clf = VotingClassifier(
    estimators=[
        ('lgbm', random_search.best_estimator_),
        ('rf', pipeline_rf),
        ('lr', pipeline_lr),
        ('xgb', pipeline_xgb)
    ],
    voting='soft',
    n_jobs=-1
)

# 5-fold cross-validated accuracy of the ensemble.
voting_scores = cross_val_score(voting_clf, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
print(f"VotingClassifier CV Accuracy Scores: {voting_scores}")
print(f"VotingClassifier Average Accuracy: {voting_scores.mean():.4f}")

VotingClassifier CV Accuracy Scores: [0.84357542 0.8258427  0.87640449 0.8258427  0.85393258]
VotingClassifier Average Accuracy: 0.8451


In [148]:
# Gerekli kütüphaneler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Gelişmiş modelleme ve pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Modelleme
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import lightgbm as lgb


import warnings
warnings.filterwarnings('ignore')


# Re-read the raw CSVs. This rebinds df_train / df_test, so the engineered
# columns added by earlier cells are no longer present on these names.
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

# Back up the original data
train = df_train.copy()
test = df_test.copy()

# Name of the target column (only the name is stored; no column is added here)
target = 'Survived'


In [149]:
# Fit the four-model voting ensemble and predict the test set.
# NOTE(review): X_train / X_test are not rebuilt after the cell above reloads
# df_train / df_test, so they presumably still hold the earlier engineered features — confirm.
voting_clf.fit(X_train, y_train)
final_preds = voting_clf.predict(X_test)

# PassengerId is taken from the current df_test.
submission_final = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Survived': final_preds
})

submission_final.to_csv('submission_final_ensemble.csv', index=False)
print(" Final tahmin dosyası hazır: submission_final_ensemble.csv")

 Final tahmin dosyası hazır: submission_final_ensemble.csv


In [150]:
def feature_engineering(df):
    """Add the engineered passenger features to ``df`` and return it.

    The frame is modified in place; the returned object is the same ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``SibSp``, ``Parch``, ``Name``, ``Age``,
        ``Fare``, ``Cabin``, ``Pclass`` and ``Sex``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with these columns added: ``FamilySize``, ``Title``,
        ``AgeBin``, ``FareBin``, ``Deck`` and ``Pclass_Sex``.

    Notes
    -----
    ``FareBin`` is based on the quartiles of the frame passed in, so two
    frames processed separately get different fare bin edges.
    """
    # Family size: siblings/spouses + parents/children + the passenger
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

    # Title extraction: the first word that is preceded by a space and followed by a period.
    # Raw string: in a normal literal "\." is an invalid escape sequence.
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
                   'Rev', 'Sir', 'Jonkheer', 'Dona']
    # Collapse uncommon titles and normalise the French variants.
    df['Title'] = df['Title'].replace(rare_titles, 'Rare')
    df['Title'] = df['Title'].replace(['Mlle', 'Ms'], 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')

    # AgeBin and FareBin - split the numeric variables into integer-coded bins
    df['AgeBin'] = pd.cut(df['Age'], bins=[0, 12, 20, 40, 60, 100], labels=False)
    df['FareBin'] = pd.qcut(df['Fare'], 4, labels=False)

    # Deck extraction (first character of the cabin number)
    df['Deck'] = df['Cabin'].str[0]
    df['Deck'] = df['Deck'].fillna('U')  # U: Unknown

    # Combined Pclass_Sex feature, e.g. "3_male"
    df['Pclass_Sex'] = df['Pclass'].astype(str) + "_" + df['Sex']

    return df


In [151]:
# Add the engineered columns to both frames (feature_engineering returns the frame it was given).
df_train = feature_engineering(df_train)
df_test = feature_engineering(df_test)

In [152]:
# Columns used as model inputs from here on ('Embarked' is not part of this list).
features = ['Pclass', 'Sex', 'Age', 'Fare', 'FamilySize', 'Title', 'AgeBin', 'FareBin', 'Deck', 'Pclass_Sex']

In [153]:
# Rebuild the model inputs from the re-engineered frames; this rebinds X_train / X_test / y_train.
X_train = df_train[features]
X_test = df_test[features]

y_train = df_train['Survived']

In [154]:
# Take the best model
best_model = random_search.best_estimator_

# Predict on the test data
# NOTE(review): random_search was fitted on the earlier feature set (string AgeBin/FareBin
# labels, Deck 'Unknown', raw titles). df_test now comes from feature_engineering(), which
# produces integer bin codes, Deck 'U' and a 'Rare' title. The ordinal encoder maps unseen
# categories to -1, so these predictions presumably suffer from train/test feature
# mismatch — confirm and re-fit on consistent features.
test_preds = best_model.predict(df_test)

# Put the results in a DataFrame and save them as CSV
submission = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Survived': test_preds
})

submission.to_csv('submission.csv', index=False)

print("Tahmin dosyası oluşturuldu: submission.csv")

Tahmin dosyası oluşturuldu: submission.csv


In [155]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score

# Sample data for a small simulation (four rows, with some missing values)
data = {
    'Age': [22, 38, np.nan, 35],
    'Fare': [7.25, 71.28, 8.05, np.nan],
    'FamilySize': [1, 1, 0, 0],
    'Sex': ['male', 'female', 'female', 'female'],
    'Embarked': ['S', 'C', 'Q', np.nan],
    'Title': ['Mr', 'Mrs', 'Miss', 'Mrs'],
    'Deck': ['C', np.nan, 'G', 'C'],
    'FareBin': ['low', 'high', 'low', 'mid'],
    'AgeBin': ['young', 'adult', 'young', 'adult'],
    'Pclass_Sex': ['3_male', '1_female', '3_female', '1_female'],
    'Survived': [0, 1, 1, 1]
}

# Split the toy frame into features and target.
df = pd.DataFrame(data)
X = df.drop('Survived', axis=1)
y = df['Survived']


# Define the column groups according to the data at hand
numeric_features = ['Age', 'Fare', 'FamilySize']
categorical_features = ['Sex', 'Embarked', 'Title', 'Deck', 'FareBin', 'AgeBin', 'Pclass_Sex']


In [156]:
# Numerical data: fill missing values with the median, then scale
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical data: fill missing values with the label 'missing', then one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine all columns
# Rebinds `preprocessor`, replacing the ColumnTransformerToDF instance used by earlier cells.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])


In [158]:
# Use a RandomForest as the example model
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])


In [159]:
# Evaluate accuracy with cross-validation
# Only 2 folds are used here; X / y are the four-row toy data.
scores = cross_val_score(model_pipeline, X, y, cv=2)
print("Ortalama Doğruluk Skoru:", scores.mean())

Ortalama Doğruluk Skoru: 0.5


In [160]:
# Make sure df_test has every column of X_train; a missing column is created and filled with 0.
for col in X_train.columns:
    if col not in df_test.columns:
        df_test[col] = 0

In [161]:
# Show which columns the training features currently contain.
print(X_train.columns)

Index(['Pclass', 'Sex', 'Age', 'Fare', 'FamilySize', 'Title', 'AgeBin',
       'FareBin', 'Deck', 'Pclass_Sex'],
      dtype='object')


In [162]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Suppose the pipeline looks like this:
# Rebinds `preprocessor` to a transformer that needs 'Embarked', 'Sex', 'Age' and 'Fare'.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), ['Embarked', 'Sex']),
        ('num', StandardScaler(), ['Age', 'Fare'])
    ])

# If 'Embarked' is missing, either add it to df_test or X_train,
# or remove it from the transformer.

In [163]:
# Give df_test every column of X_train; a column missing from df_test is filled with 0.
for col in X_train.columns:
    if col not in df_test.columns:
        df_test[col] = 0

# X_train was selected out of df_train, so take an explicit copy before adding
# columns; this avoids assigning into a slice of another frame.
X_train = X_train.copy()

# Give X_train every column of df_test.
for col in df_test.columns:
    if col not in X_train.columns:
        # Use the real training values when df_train has the column (e.g. 'Embarked');
        # a constant 0 would make the column carry no information during fitting.
        X_train[col] = df_train[col] if col in df_train.columns else 0


In [169]:
# Training
# NOTE(review): the execution counts around this cell are out of order (169, then 170, 167, 168),
# so which definition of `model_pipeline` is fitted here depends on the run order — confirm.
model_pipeline.fit(X_train, y_train)

# Predict on the test set
test_preds = model_pipeline.predict(df_test)

# Save the results
submission = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Survived': test_preds
})

submission.to_csv('final_submission.csv', index=False)
print("Tahmin dosyası kaydedildi.")


Tahmin dosyası kaydedildi.


In [170]:
# Reduced feature groups; rebinds the column lists defined for the earlier preprocessing pipeline.
categorical_cols = ['Sex', 'Embarked', 'Pclass']
numerical_cols = ['Age', 'Fare']

In [167]:

# Categorical processing steps: most-frequent imputation, then one-hot encoding
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Numerical processing steps: mean imputation only
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

# Preprocessing step
preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_cols),
    ('num', numerical_transformer, numerical_cols)
])

# Model pipeline
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])


In [168]:
# Use only these columns from train/test
X_train_filtered = X_train[categorical_cols + numerical_cols]
df_test_filtered = df_test[categorical_cols + numerical_cols]

# Fit
model_pipeline.fit(X_train_filtered, y_train)

# Predict
test_preds = model_pipeline.predict(df_test_filtered)

# Create the CSV (same file name, 'final_submission.csv', as the earlier submission cell)
submission = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Survived': test_preds
})
submission.to_csv('final_submission.csv', index=False)
print("Tahmin dosyası kaydedildi.")


Tahmin dosyası kaydedildi.
