In [5]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# -----------------------------------
# 1) EXTRACT + SELECT COLUMNS
# -----------------------------------
# Read the raw training CSV from the current working directory.
df = pd.read_csv("train.csv")

# Only these columns are used by the rest of the notebook.
selected_columns = [
    "PassengerId", "Survived", "Pclass", "Sex", "Age", "Fare", "Embarked",
]

# Take an independent copy of the selected columns (so the raw `df` is never
# modified) and rename Sex -> Gender to match the assignment terminology.
df_titanic = df[selected_columns].copy().rename(columns={"Sex": "Gender"})

# -----------------------------------
# 2) BUILD PIPELINE (TRANSFORM)
# -----------------------------------

# Column groups, split by the kind of preprocessing each one receives.
numeric_features = ["Age", "Fare"]
categorical_features = ["Gender", "Embarked"]

# Numeric columns: fill missing values with the column median, then
# standardize with StandardScaler.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" makes the encoder tolerate
# categories at transform time that were not present during fit.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its sub-pipeline. The transformer names
# ("num", "cat_onehot") become prefixes of the output feature names.
column_routes = [
    ("num", numeric_transformer, numeric_features),
    ("cat_onehot", categorical_transformer, categorical_features),
]

# remainder="passthrough" keeps every column NOT listed above unchanged.
# Gender and Embarked are listed, so they are encoded rather than passed
# through; for df_titanic the untouched columns are PassengerId, Survived
# and Pclass.
preprocessor = ColumnTransformer(
    transformers=column_routes,
    remainder="passthrough",
)

# Wrap the ColumnTransformer in a single-step Pipeline.
full_pipeline = Pipeline(steps=[("preprocessor", preprocessor)])

# Fit the preprocessing pipeline on the selected columns and transform them
# in the same call.
X_processed = full_pipeline.fit_transform(df_titanic)

# The fitted ColumnTransformer reports its output column names, prefixed with
# the name of the transformer that produced each one
# (num__, cat_onehot__, remainder__).
fitted_preprocessor = full_pipeline.named_steps["preprocessor"]
feature_names = fitted_preprocessor.get_feature_names_out()

# Wrap the transformed matrix in a DataFrame labelled with those names.
df_processed = pd.DataFrame(data=X_processed, columns=feature_names)

print("Processed columns:")
print(df_processed.columns)
print(df_processed.head())

Processed columns:
Index(['num__Age', 'num__Fare', 'cat_onehot__Gender_female',
       'cat_onehot__Gender_male', 'cat_onehot__Embarked_C',
       'cat_onehot__Embarked_Q', 'cat_onehot__Embarked_S',
       'remainder__PassengerId', 'remainder__Survived', 'remainder__Pclass'],
      dtype='object')
   num__Age  num__Fare  cat_onehot__Gender_female  cat_onehot__Gender_male  \
0 -0.565736  -0.502445                        0.0                      1.0   
1  0.663861   0.786845                        1.0                      0.0   
2 -0.258337  -0.488854                        1.0                      0.0   
3  0.433312   0.420730                        1.0                      0.0   
4  0.433312  -0.486337                        0.0                      1.0   

   cat_onehot__Embarked_C  cat_onehot__Embarked_Q  cat_onehot__Embarked_S  \
0                     0.0                     0.0                     1.0   
1                     1.0                     0.0                     0.0   
2

In [9]:
# Display the column labels of the processed frame (the feature names used
# by the cells that follow).
df_processed.columns

Index(['num__Age', 'num__Fare', 'cat_onehot__Gender_female',
       'cat_onehot__Gender_male', 'cat_onehot__Embarked_C',
       'cat_onehot__Embarked_Q', 'cat_onehot__Embarked_S',
       'remainder__PassengerId', 'remainder__Survived', 'remainder__Pclass'],
      dtype='object')

In [10]:
# -----------------------------------
# 3) BUILD FINAL CLEANED DATAFRAME
# -----------------------------------

# Embarked as a single integer code (C=0, Q=1, S=2), derived from the three
# one-hot indicator columns: the dot product of each one-hot row with
# [0, 1, 2] yields 0, 1 or 2.
embarked_indicator_cols = [
    "cat_onehot__Embarked_C",
    "cat_onehot__Embarked_Q",
    "cat_onehot__Embarked_S",
]
embarked_onehot = df_processed[embarked_indicator_cols]
embarked_codes = embarked_onehot.dot([0, 1, 2]).astype(int)

# Cleaned frame, one column per target table field, in this order.
df_clean = pd.DataFrame({
    # Original identifier / label / class columns (passed through unchanged).
    "PassengerId": df_processed["remainder__PassengerId"].astype(int),
    "Survived": df_processed["remainder__Survived"].astype(int),
    "Pclass": df_processed["remainder__Pclass"].astype(int),
    # Numeric columns after imputation and scaling.
    "Age": df_processed["num__Age"].astype(float),
    "Fare": df_processed["num__Fare"].astype(float),
    # Sex encoding: male=1, female=0. The Gender_male indicator is 1 for a
    # male passenger and 0 for a female passenger.
    "Sex": df_processed["cat_onehot__Gender_male"].astype(int),
    "Embarked": embarked_codes,
})

# Separate frame holding every one-hot column (kept for the ML part).
onehot_cols = [
    name for name in df_processed.columns if name.startswith("cat_onehot__")
]
df_onehot = df_processed.loc[:, onehot_cols].copy()

# df_final = cleaned columns plus all one-hot columns, side by side.
df_final = pd.concat([df_clean, df_onehot], axis="columns")

# Check for missing values.
print("Missing values per column in df_clean:")
print(df_clean.isna().sum())

print("\nHead of df_clean (for MySQL):")
print(df_clean.head())

print("\nHead of df_final (with one-hot):")
print(df_final.head())


Missing values per column in df_clean:
PassengerId    0
Survived       0
Pclass         0
Age            0
Fare           0
Sex            0
Embarked       0
dtype: int64

Head of df_clean (for MySQL):
   PassengerId  Survived  Pclass       Age      Fare  Sex  Embarked
0            1         0       3 -0.565736 -0.502445    1         2
1            2         1       1  0.663861  0.786845    0         0
2            3         1       3 -0.258337 -0.488854    0         2
3            4         1       1  0.433312  0.420730    0         2
4            5         0       3  0.433312 -0.486337    1         2

Head of df_final (with one-hot):
   PassengerId  Survived  Pclass       Age      Fare  Sex  Embarked  \
0            1         0       3 -0.565736 -0.502445    1         2   
1            2         1       1  0.663861  0.786845    0         0   
2            3         1       3 -0.258337 -0.488854    0         2   
3            4         1       1  0.433312  0.420730    0         2   
4

In [11]:
import os
from getpass import getpass

import pymysql

# Credentials must never be stored in the notebook. The password is read from
# the MYSQL_PASSWORD environment variable; if that is not set, it is requested
# interactively (getpass does not echo the typed value).
mysql_password = os.environ.get("MYSQL_PASSWORD")
if mysql_password is None:
    mysql_password = getpass("MySQL password: ")

# Host and user can be overridden through MYSQL_HOST / MYSQL_USER and default
# to a local server and the root account.
connection = pymysql.connect(
    host=os.environ.get("MYSQL_HOST", "localhost"),
    user=os.environ.get("MYSQL_USER", "root"),
    password=mysql_password,
    charset="utf8mb4",
    autocommit=True,
)
cursor = connection.cursor()

# Create the target database if it does not exist yet, then select it for
# the statements that follow.
cursor.execute("CREATE DATABASE IF NOT EXISTS titanic_db;")
cursor.execute("USE titanic_db;")
print("Database 'titanic_db' created successfully!")


Database 'titanic_db' created successfully!


In [16]:
# -----------------------------------
# 4) CREATE TABLE titanic_clean (drop + create)
# -----------------------------------

# Statements run in order: first remove any previous version of the table,
# then create it again with the intended structure.
rebuild_statements = (
    "DROP TABLE IF EXISTS titanic_clean;",
    """
CREATE TABLE titanic_clean (
    PassengerId INT PRIMARY KEY,
    Survived TINYINT,
    Pclass TINYINT,
    Age DOUBLE,
    Fare DOUBLE,
    Sex TINYINT,        -- male=1, female=0
    Embarked TINYINT    -- C=0, Q=1, S=2
);
""",
)
for statement in rebuild_statements:
    cursor.execute(statement)

connection.commit()
print("Table 'titanic_clean' dropped (if existed) and created successfully!")


Table 'titanic_clean' dropped (if existed) and created successfully!


In [17]:
# -----------------------------------
# 5) LOAD CLEANED DATA INTO MySQL
# -----------------------------------

# Parameterized INSERT: one %s placeholder per table column.
insert_sql = """
INSERT INTO titanic_clean (
    PassengerId, Survived, Pclass,
    Age, Fare, Sex, Embarked
)
VALUES (%s,%s,%s,%s,%s,%s,%s);
"""

# Build every row up front as a tuple of plain Python ints/floats, in the
# same order as the column list in insert_sql.
records = [
    (
        int(passenger.PassengerId),
        int(passenger.Survived),
        int(passenger.Pclass),
        float(passenger.Age),
        float(passenger.Fare),
        int(passenger.Sex),
        int(passenger.Embarked),
    )
    for passenger in df_clean.itertuples(index=False)
]

# Send all rows in one executemany call, then commit.
cursor.executemany(insert_sql, records)
connection.commit()

print("All cleaned rows inserted successfully into 'titanic_clean'!")

# Optional: verify the number of rows now stored in the table.
cursor.execute("SELECT COUNT(*) FROM titanic_clean;")
(row_count,) = cursor.fetchone()
print("Rows in titanic_clean:", row_count)


All cleaned rows inserted successfully into 'titanic_clean'!
Rows in titanic_clean: 891


In [18]:
# Ask MySQL for the column definitions of titanic_clean and print each
# returned row on its own line.
cursor.execute("SHOW COLUMNS FROM titanic_clean;")
column_rows = cursor.fetchall()
for column_info in column_rows:
    print(column_info)

('PassengerId', 'int', 'NO', 'PRI', None, '')
('Survived', 'tinyint', 'YES', '', None, '')
('Pclass', 'tinyint', 'YES', '', None, '')
('Age', 'double', 'YES', '', None, '')
('Fare', 'double', 'YES', '', None, '')
('Sex', 'tinyint', 'YES', '', None, '')
('Embarked', 'tinyint', 'YES', '', None, '')
