In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Fungsi untuk membersihkan data
def clean_data(df):
    df['pages'] = df['pages'].astype(str)
    invalid_pages = ['ebook', 'Paperback', 'Hardcover', 'book']
    df = df[~df['pages'].str.contains('|'.join(invalid_pages), case=False, na=False)]
    df['pages'] = df['pages'].apply(lambda x: int(x.split()[0]) if x.split()[0].isdigit() else np.nan)
    
    def convert_followers(x):
        try:
            if 'k' in x:
                return int(float(x[:-1]) * 1000)
            else:
                return int(x)
        except:
            return np.nan
    
    df['author_followers'] = df['author_followers'].apply(convert_followers)
    df.dropna(inplace=True)
    
    return df

# Membaca dataset
train_features = pd.read_csv('dataset/train_features.csv')
train_labels = pd.read_csv('dataset/train_labels.csv')
test_features = pd.read_csv('dataset/test_features.csv')

# Menggabungkan fitur dan label sebelum pembersihan
train_data = pd.concat([train_features, train_labels], axis=1)

# Membersihkan data
train_data = clean_data(train_data)
test_features = clean_data(test_features)

# Memisahkan fitur dan label setelah pembersihan
y_train = train_data['votes']

# Melatih model dan menampilkan hasil untuk setiap fitur
features = ['rating_avg', 'pages', 'author_followers']
results = {}
models = {}

for feature in features:
    X_train = train_data[[feature]]
    X_test = test_features[[feature]]
    
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    mse_train = mean_squared_error(y_train, y_train_pred)
    r2_train = r2_score(y_train, y_train_pred)
    intercept = model.intercept_
    coefficients = model.coef_[0]
    
    results[feature] = {
        'Intercept': intercept,
        'Coefficients': coefficients,
        'Mean Squared Error (Train)': mse_train,
        'R2 Score (Train)': r2_train
    }
    
    models[feature] = model  # Simpan model untuk prediksi di kemudian hari
    
    print(f"Results for feature '{feature}':")
    print(f"Intercept: {intercept}")
    print(f"Coefficients: {coefficients}")
    print(f"Mean Squared Error (Train): {mse_train}")
    print(f"R2 Score (Train): {r2_train}")
    print()

# Menguji model dengan data test dan menyimpan prediksi ke dalam file CSV
test_predictions = pd.DataFrame({feature: models[feature].predict(test_features[[feature]]) for feature in features})
test_predictions.to_csv('test_predictions.csv', index=False)

# Menentukan hasil terbaik berdasarkan R2 Score
best_feature = max(results, key=lambda feature: results[feature]['R2 Score (Train)'])
best_result = results[best_feature]

print("Hasil Terbaik Berdasarkan Nilai R2:")
print(f"Feature: {best_feature}")
print(f"Intercept: {best_result['Intercept']}")
print(f"Coefficients: {best_result['Coefficients']}")
print(f"Mean Squared Error (Train): {best_result['Mean Squared Error (Train)']}")
print(f"R2 Score (Train): {best_result['R2 Score (Train)']}")


Results for feature 'rating_avg':
Intercept: -5228.913922840877
Coefficients: 1736.269602766072
Mean Squared Error (Train): 46913411.982034445
R2 Score (Train): 0.007382656954886491

Results for feature 'pages':
Intercept: 429.2103129045022
Coefficients: 3.6184723014961335
Mean Squared Error (Train): 46937212.7352426
R2 Score (Train): 0.006879068760939022

Results for feature 'author_followers':
Intercept: 1281.3872261191545
Coefficients: 0.023949393835432688
Mean Squared Error (Train): 45270849.438751884
R2 Score (Train): 0.04213681357267707

Hasil Terbaik Berdasarkan Nilai R2:
Feature: author_followers
Intercept: 1281.3872261191545
Coefficients: 0.023949393835432688
Mean Squared Error (Train): 45270849.438751884
R2 Score (Train): 0.04213681357267707


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pages'] = df['pages'].apply(lambda x: int(x.split()[0]) if x.split()[0].isdigit() else np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['author_followers'] = df['author_followers'].apply(convert_followers)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.