In [9]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import FunctionTransformer


In [27]:
# Load the four CSV files this notebook works with.
# NOTE(review): "submission.csv" and "submission_format2.csv" are written by
# other cells of this notebook, so on a fresh checkout this cell presumably
# fails until those cells have been run at least once - confirm.
data_train = pd.read_csv("train.csv")
data_target = pd.read_csv("target.csv")
submission = pd.read_csv("submission.csv")
submission_format = pd.read_csv("submission_format2.csv")

In [4]:
# Attach the columns of target.csv to the training frame, side by side.
# pd.concat(axis=1) aligns on the row index, so both frames are presumably
# expected to list their rows in the same order - verify.
# The following cell overwrites "train.csv" with the merged frame, so on a
# re-run data_train may already contain the target columns. Only columns that
# are not yet present are appended, which keeps this cell idempotent and avoids
# duplicated column labels.
target_only_columns = [col for col in data_target.columns if col not in data_train.columns]
merge_data = pd.concat([data_train, data_target[target_only_columns]], axis=1)

In [5]:
# Persist the merged frame without writing the index as a column.
# NOTE(review): the destination is "train.csv", the same file this notebook
# reads as its training input, so the original file is overwritten - keep a
# backup of the raw file.
merge_data.to_csv("train.csv",index=False)

In [10]:
# Converts the "downloads" column, stored as range strings (e.g. "100000 - 500000")
def parse_download_range(X):
    """Convert range strings such as "100000 - 500000" to their midpoint.

    Parameters
    ----------
    X : pandas.DataFrame, pandas.Series, numpy.ndarray or sequence
        Raw values. For a DataFrame or a 2-D array only the first column is
        used.

    Returns
    -------
    numpy.ndarray
        Float array of shape (n_samples, 1). A value that does not consist of
        exactly two numbers separated by "-" (thousands separators "," are
        allowed) is mapped to NaN.
    """
    if isinstance(X, pd.DataFrame):
        X = X.iloc[:, 0]
    values = np.atleast_1d(np.asarray(X, dtype=object))
    # A 2-D array iterates row by row, yielding 1-element arrays whose str()
    # form ("['100 - 500']") can never be parsed. Use the first column instead,
    # mirroring the DataFrame branch above.
    if values.ndim == 2:
        values = values[:, 0]

    def convert_range(s):
        # Split on "-" and require exactly a lower and an upper bound
        parts = str(s).split('-')
        if len(parts) != 2:
            return np.nan
        try:
            # Strip whitespace and thousands separators before converting
            low = float(parts[0].strip().replace(',', ''))
            high = float(parts[1].strip().replace(',', ''))
        except ValueError:
            # Non-numeric bound: treat the whole entry as missing
            return np.nan
        return (low + high) / 2

    # Apply element-wise and return a column vector of shape (n_samples, 1)
    result = np.array([convert_range(val) for val in values], dtype=float)
    return result.reshape(-1, 1)

# Wrap the range parser so it can be used as a pipeline step
download_transformer = FunctionTransformer(func=parse_download_range, validate=False)

# --- READ DATA ---
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
submission_format = pd.read_csv("submission_format.csv")

# Name of the label column (must match the column name in train.csv)
target_column = "coppaRisk"

# Split the training frame into the label and the remaining columns
y_train = train_df[target_column]
X_train = train_df.drop(columns=target_column)

# --- FEATURE GROUPS FOR PREPROCESSING ---

# Numeric columns that are used as they are
numeric_features = [
    'userRatingCount',
    'isCorporateEmailScore',
    'adSpent',
    'appAge',
    'averageUserRating',
]

# The downloads column holds range strings and gets its own parsing step
downloads_feature = ["downloads"]

# Nominal columns, one-hot encoded later on
categorical_features = [
    'developerCountry',
    'countryCode',
    'primaryGenreName',
    'deviceType',
    'hasPrivacyLink',
    'hasTermsOfServiceLink',
]

# Columns treated as ordered categories ("low", "medium", "high")
ordinal_features = [
    'hasTermsOfServiceLinkRating',
    'appContentBrandSafetyRating',
    'appDescriptionBrandSafetyRating',
    'mfaRating',
]

# --- PREPROCESSING PIPELINES ---

def _cast_to_str(X):
    """Cast every value to ``str`` so the one-hot encoder sees a single dtype.

    Defined as a named function rather than a lambda: a FunctionTransformer
    that wraps a lambda cannot be pickled, which would prevent saving the
    fitted pipeline.
    """
    return X.astype(str)


# Numeric features: median imputation followed by standardisation
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# "downloads": parse the range strings first, then treat like a numeric feature
downloads_pipeline = Pipeline(steps=[
    ("download_parser", download_transformer),
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Nominal features: fill gaps with the literal "missing", cast to string,
# one-hot encode (categories unseen during fit are ignored at transform time)
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("to_str", FunctionTransformer(_cast_to_str)),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Ordinal features: every column uses the order ["low", "medium", "high"];
# missing values are imputed as "low"
ordinal_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="low")),
    ("ordinal", OrdinalEncoder(categories=[["low", "medium", "high"]]*len(ordinal_features)))
])

# Route each feature group through its own pipeline
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("downloads", downloads_pipeline, downloads_feature),
    ("cat", categorical_transformer, categorical_features),
    ("ord", ordinal_transformer, ordinal_features)
])

# --- MODEL PIPELINE ---
# RandomForestClassifier serves as the example estimator
clf = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42)),
])

# --- MODEL EVALUATION (optional) ---
# 5-fold cross-validated accuracy on the training data
cv_scores = cross_val_score(
    estimator=clf,
    X=X_train,
    y=y_train,
    scoring="accuracy",
    cv=5,
)
print("CV Accuracy scores: ", cv_scores)
print("Rata-rata CV Accuracy: ", cv_scores.mean())

# --- TRAIN ON THE FULL TRAINING SET ---
clf.fit(X_train, y_train)

# --- PREDICT THE TEST SET ---
# test_df has to provide the same feature columns (by name) as X_train
test_predictions = clf.predict(test_df)
# clf.predict_proba(test_df) can be used instead when probabilities are needed

# --- BUILD THE SUBMISSION FILE ---
# When the template has an "id" column it becomes the first column of the
# submission; otherwise the rows are assumed to already be in template order.
submission_columns = {}
if "id" in submission_format.columns:
    submission_columns["id"] = submission_format["id"]
submission_columns["coppaRisk"] = test_predictions
submission = pd.DataFrame(submission_columns)

# Write the predictions to disk
submission.to_csv("submission.csv", index=False)
print("File submission.csv telah disimpan.")

CV Accuracy scores:  [0.89928571 0.90571429 0.90214286 0.89214286 0.89857143]
Rata-rata CV Accuracy:  0.8995714285714286
File submission.csv telah disimpan.


In [25]:
# Put the submission template and the predictions side by side, then save
merge_data = pd.concat([submission_format, submission], axis="columns")
merge_data.to_csv("submission_format2.csv", index=False)

In [23]:
# Remove the placeholder label column from the template.
# Rebinding instead of inplace=True, together with errors="ignore", makes the
# cell safe to run more than once: the in-place version raised KeyError on the
# second run because the column was already gone.
submission_format = submission_format.drop(columns="coppaRisk", errors="ignore")

In [24]:
# Save the template, without the index, to "submission_format2.csv".
# NOTE(review): another cell of this notebook writes a different frame
# (merge_data) to this same path, so the file content depends on which of the
# two cells ran last.
submission_format.to_csv("submission_format2.csv",index=False)

In [19]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
import warnings
# Suppresses every warning raised from here on.
# NOTE(review): no category or module is given, so this is a blanket filter -
# consider narrowing it to the specific warning that was noisy.
warnings.filterwarnings("ignore")

# Converts the downloads column from "min - max" strings to the range midpoint
def parse_download_range(X):
    """Convert range strings such as "100000 - 500000" to their midpoint.

    Parameters
    ----------
    X : pandas.DataFrame, pandas.Series, numpy.ndarray or sequence
        Raw values. For a DataFrame or a 2-D array only the first column is
        used.

    Returns
    -------
    numpy.ndarray
        Float array of shape (n_samples, 1). A value that does not consist of
        exactly two numbers separated by "-" (thousands separators "," are
        allowed) is mapped to NaN.
    """
    if isinstance(X, pd.DataFrame):
        X = X.iloc[:, 0]
    values = np.atleast_1d(np.asarray(X, dtype=object))
    # A 2-D array iterates row by row, yielding 1-element arrays whose str()
    # form ("['100 - 500']") can never be parsed. Use the first column instead,
    # mirroring the DataFrame branch above.
    if values.ndim == 2:
        values = values[:, 0]

    def convert_range(s):
        # Split on "-" and require exactly a lower and an upper bound
        parts = str(s).split('-')
        if len(parts) != 2:
            return np.nan
        try:
            # Strip whitespace and thousands separators before converting
            low = float(parts[0].strip().replace(',', ''))
            high = float(parts[1].strip().replace(',', ''))
        except ValueError:
            # Non-numeric bound: treat the whole entry as missing
            return np.nan
        return (low + high) / 2

    # Apply element-wise and return a column vector of shape (n_samples, 1)
    result = np.array([convert_range(val) for val in values], dtype=float)
    return result.reshape(-1, 1)

# Expose the range parser as a transformer for the downloads column
download_transformer = FunctionTransformer(func=parse_download_range, validate=False)

# --- FEATURE GROUPS ---

# Numeric columns
numeric_features = [
    'userRatingCount',
    'isCorporateEmailScore',
    'adSpent',
    'appAge',
    'averageUserRating',
]

# Downloads column (if it is available in the data)
downloads_feature = ["downloads"]

# Nominal columns; their values are cast to string later so the dtype is homogeneous
categorical_features = [
    'developerCountry',
    'countryCode',
    'primaryGenreName',
    'deviceType',
    'hasPrivacyLink',
    'hasTermsOfServiceLink',
]

# Columns treated as ordered categories (low < medium < high)
ordinal_features = [
    'hasTermsOfServiceLinkRating',
    'appContentBrandSafetyRating',
    'appDescriptionBrandSafetyRating',
    'mfaRating',
]

def _cast_to_str(X):
    """Cast every value to ``str`` so the one-hot encoder sees a single dtype.

    Defined as a named function rather than a lambda: a FunctionTransformer
    that wraps a lambda cannot be pickled, which would prevent saving the
    fitted pipeline.
    """
    return X.astype(str)


# Numeric features: median imputation followed by standardisation
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Downloads feature: parse the range strings first, then treat like a numeric feature
downloads_pipeline = Pipeline(steps=[
    ("download_parser", download_transformer),
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Nominal features: fill gaps with the literal "missing", cast to string so the
# dtype is homogeneous, then one-hot encode (categories unseen during fit are
# ignored at transform time)
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("to_str", FunctionTransformer(_cast_to_str)),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Ordinal features: every column uses the order ["low", "medium", "high"];
# missing values are imputed as "low"
ordinal_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="low")),
    ("ordinal", OrdinalEncoder(categories=[["low", "medium", "high"]] * len(ordinal_features)))
])

# Combine all transformers in one ColumnTransformer.
# If the data has no downloads column, the "downloads" entry can be commented out.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("downloads", downloads_pipeline, downloads_feature),
    ("cat", categorical_transformer, categorical_features),
    ("ord", ordinal_transformer, ordinal_features)
])

# --- READ DATA ---
# train.csv is expected to already contain the label column
train_df = pd.read_csv("train.csv")

# Label column
target_column = "coppaRisk"

# Split the training frame into the label and the remaining columns
y_train = train_df[target_column]
X_train = train_df.drop(columns=target_column)

# --- ALTERNATIVE MODEL: GradientBoostingClassifier ---
model_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", GradientBoostingClassifier(random_state=42)),
])

# Baseline: 5-fold cross-validated accuracy before any tuning
cv_scores = cross_val_score(
    estimator=model_pipeline,
    X=X_train,
    y=y_train,
    scoring="accuracy",
    cv=5,
)
print("CV Accuracy (sebelum tuning):", cv_scores)
print("Rata-rata CV Accuracy:", cv_scores.mean())


# --- HYPERPARAMETER TUNING WITH GridSearchCV ---
# Keys use the "classifier__" prefix to address the pipeline's classifier step
param_grid = {
    "classifier__n_estimators": [50, 100, 150],
    "classifier__learning_rate": [0.01, 0.1, 0.2],
    "classifier__max_depth": [3, 5, 7],
}

# Exhaustive search over the grid, scored by 5-fold accuracy, using all cores
grid_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

print("\nBest Parameters:", grid_search.best_params_)
print("Best CV Accuracy:", grid_search.best_score_)

# The best estimator found by the search is used as the final model
best_model = grid_search.best_estimator_

# Predict the test set with the tuned model
test_df = pd.read_csv("test.csv")
predictions = best_model.predict(test_df)

# Example submission file: a single "coppaRisk" column.
# NOTE(review): no identifier column is written here - add one if the
# submission format requires it.
submission = pd.Series(predictions, name="coppaRisk").to_frame()
submission.to_csv("submission2.csv", index=False)


CV Accuracy (sebelum tuning): [0.90214286 0.90571429 0.9        0.89785714 0.89928571]
Rata-rata CV Accuracy: 0.901

Best Parameters: {'classifier__learning_rate': 0.01, 'classifier__max_depth': 5, 'classifier__n_estimators': 150}
Best CV Accuracy: 0.9027142857142858


In [26]:
# Write df_imputed to "imputed.csv" without the index column.
# NOTE(review): df_imputed is not defined anywhere in the visible notebook and
# the recorded output of this cell is a NameError - either restore the cell
# that created it or delete this cell.
df_imputed.to_csv("imputed.csv", index=False)

NameError: name 'df_imputed' is not defined