In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Data-cleaning helper
def clean_data(df):
    """Return a cleaned copy of *df* with numeric ``pages`` and ``author_followers``.

    Steps, in order:

    1. Drop rows whose ``pages`` text contains any of the format words in
       ``invalid_pages`` (case-insensitive substring match).
    2. Replace ``pages`` with the leading integer of its text (e.g.
       ``"320 pages"`` -> ``320``); anything that does not start with a
       run of digits becomes NaN.
    3. Replace ``author_followers`` with an integer count.  Strings ending
       in ``k``/``K`` are read as thousands (``"1.5k"`` -> ``1500``);
       values that are already numeric are kept; anything unparsable
       becomes NaN.
    4. Drop every row that has a missing value in *any* column.

    The input frame is not modified.

    Args:
        df: DataFrame with at least ``pages`` and ``author_followers`` columns.

    Returns:
        A new DataFrame containing only the rows that survived cleaning.
        The original index labels are preserved.
    """
    invalid_pages = ['ebook', 'Paperback', 'Hardcover', 'book']

    def parse_pages(text):
        # A blank / whitespace-only value has no tokens; treat it as missing
        # instead of raising IndexError.
        tokens = text.split()
        if tokens and tokens[0].isdigit():
            return int(tokens[0])
        return np.nan

    def convert_followers(x):
        if isinstance(x, str):
            text = x.strip()
            try:
                if text.lower().endswith('k'):
                    return int(float(text[:-1]) * 1000)
                return int(text)
            except (ValueError, OverflowError):
                return np.nan
        # bool is a subclass of int but is never a meaningful follower count.
        if isinstance(x, bool):
            return np.nan
        # Values already parsed as numbers (e.g. a column with no "k"
        # entries) must be kept rather than discarded as invalid.
        if isinstance(x, (int, np.integer)):
            return int(x)
        if isinstance(x, (float, np.floating)):
            return int(x) if np.isfinite(x) else np.nan
        return np.nan

    # Work from a string view of the column so the caller's frame is untouched.
    pages_text = df['pages'].astype(str)
    has_invalid_word = pages_text.str.contains(
        '|'.join(invalid_pages), case=False, na=False
    )

    # Explicit copy: later column assignments must not target a slice of df.
    cleaned = df.loc[~has_invalid_word].copy()
    cleaned['pages'] = pages_text[~has_invalid_word].apply(parse_pages)
    cleaned['author_followers'] = cleaned['author_followers'].apply(convert_followers)

    return cleaned.dropna()

# Load the datasets
train_features = pd.read_csv('dataset/train_features.csv')
train_labels = pd.read_csv('dataset/train_labels.csv')
test_features = pd.read_csv('dataset/test_features.csv')

# Join features and labels side by side before cleaning, so that any row
# removed by clean_data() is removed from both together.
train_data = pd.concat([train_features, train_labels], axis=1)

# Clean the data
train_data = clean_data(train_data)
test_features = clean_data(test_features)

# Pull the target column back out after cleaning
y_train = train_data['votes']

# Fit one single-feature linear regression per feature and report its results
features = ['rating_avg', 'pages', 'author_followers']
results = {}
predictions_by_feature = {}

for feature in features:
    X_train = train_data[[feature]]
    X_test = test_features[[feature]]

    model = LinearRegression()
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    # Keep the test predictions of *this* model.  `model` is rebound on every
    # iteration, so predicting again after the loop would apply the
    # last-fitted model to every feature.
    predictions_by_feature[feature] = y_test_pred

    mse_train = mean_squared_error(y_train, y_train_pred)
    r2_train = r2_score(y_train, y_train_pred)
    intercept = model.intercept_
    coefficients = model.coef_[0]

    results[feature] = {
        'Intercept': intercept,
        'Coefficients': coefficients,
        'Mean Squared Error (Train)': mse_train,
        'R2 Score (Train)': r2_train
    }

    print(f"Results for feature '{feature}':")
    print(f"Intercept: {intercept}")
    print(f"Coefficients: {coefficients}")
    print(f"Mean Squared Error (Train): {mse_train}")
    print(f"R2 Score (Train): {r2_train}")
    print()

# Save the predictions to a CSV file: one column per feature, each holding
# the output of the model that was trained on that feature.
test_predictions = pd.DataFrame(predictions_by_feature)
test_predictions.to_csv('test_predictions.csv', index=False)
