# Improved Model Notebook

In [8]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import r2_score

# Load dataset
file_path = './data/p22v29_KMP_csv.csv'
df = pd.read_csv(
    file_path, 
    index_col=["pid"], 
    encoding='ISO-8859-1', 
    dtype={"column_name": "float64"}  # 문제의 열을 정확한 유형으로 지정
)


# Rename columns for better readability
df = df.rename(columns={
    'p22gender': 'sex',
    'p22age1': 'age',
    'p22d30001': 'call_time_mins',
    'p22c02003': 'combined',
    'p22c02001': 'who_pay_fee',
    'p22a03002': 'phone_type',
    'p22a03008': 'mobile_carrier',
    'p22a03024': 'voice_unlimited',
    'p22a03026': 'data_unlimited',
    'p22a03014': 'welfare_discount',
    'p22a03038': 'phone_period',
    'p22a03028': 'phone_manufacturer',
    'p22l01001': 'tablet',
})

# Filter data for age 65 and above
df = df[df['age'] >= 65]

# Drop rows with missing values
df = df.dropna()


  df = pd.read_csv(


In [9]:

# Feature and target selection
X = df[['age', 'call_time_mins', 'phone_period']]
y = df['combined']

# Clean and preprocess data
X = X.apply(pd.to_numeric, errors='coerce').dropna()
y = y[X.index]

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Polynomial features for capturing non-linear relationships
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X_scaled)


In [10]:

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=42)

# Train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict and calculate R2 score
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)

# Save predictions
df_cleaned = df.loc[X.index]
df_cleaned['predicted_combined'] = model.predict(X_poly)
df_cleaned.to_csv('/mnt/data/improved_model_results_cleaned.csv', index=False)

print(f"R2 Score: {r2}")


OSError: Cannot save file into a non-existent directory: '/mnt/data'