In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Data-cleaning helper
def clean_data(df):
    """Return a cleaned copy of *df* with numeric 'pages' and 'author_followers'.

    Steps:
      1. Rows whose 'pages' text mentions a book format ('ebook', 'Paperback',
         'Hardcover', 'book'; case-insensitive) are removed.
      2. 'pages' is reduced to its first whitespace-separated token when that
         token consists only of decimal digits ('320 pages' -> 320);
         anything else becomes NaN.
      3. 'author_followers' is converted to an integer count. A string
         containing 'k' is read as thousands ('1.5k' -> 1500). Values that
         are already numeric are kept. Unparseable values become NaN.
      4. Every row that still contains a NaN in any column is dropped.

    The input frame is not modified; the returned DataFrame keeps the index
    labels of the surviving rows.

    Args:
        df: DataFrame with at least the columns 'pages' and
            'author_followers'.

    Returns:
        A new DataFrame holding only the fully populated, cleaned rows.

    Raises:
        KeyError: If 'pages' or 'author_followers' is missing from *df*.
    """
    invalid_pages = ['ebook', 'Paperback', 'Hardcover', 'book']

    def parse_pages(value):
        # A blank or whitespace-only cell yields no tokens; treat it as
        # missing instead of indexing into an empty list.
        tokens = str(value).split()
        # isdecimal() matches the digit characters int() can parse, whereas
        # isdigit() also accepts e.g. superscripts that make int() raise.
        if tokens and tokens[0].isdecimal():
            return int(tokens[0])
        return np.nan

    def convert_followers(value):
        # Booleans are not follower counts (bool is a subclass of int).
        if isinstance(value, (bool, np.bool_)):
            return np.nan
        # A column that was already parsed as numeric arrives here as
        # int/float rather than str; keep those values instead of turning
        # the whole column into NaN.
        if isinstance(value, (int, np.integer)):
            return int(value)
        if isinstance(value, (float, np.floating)):
            return int(value) if np.isfinite(value) else np.nan
        if not isinstance(value, str):
            return np.nan

        text = value.strip()
        try:
            if 'k' in text:
                # round() before int() so binary float error cannot truncate
                # the count (e.g. 1.005 * 1000 lands slightly below 1005).
                return int(round(float(text[:-1]) * 1000))
            return int(text)
        except (ValueError, OverflowError):
            return np.nan

    # Work on a private copy so the caller's DataFrame is left untouched.
    cleaned = df.copy()
    cleaned['pages'] = cleaned['pages'].astype(str)

    # Drop rows whose page text names a book format.
    mentions_format = cleaned['pages'].str.contains(
        '|'.join(invalid_pages), case=False, na=False
    )
    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of another frame.
    cleaned = cleaned.loc[~mentions_format].copy()

    cleaned['pages'] = cleaned['pages'].apply(parse_pages)
    cleaned['author_followers'] = cleaned['author_followers'].apply(convert_followers)

    # Remove every row that has a NaN left after the conversions.
    return cleaned.dropna()

# Load the training features, the training labels and the test features.
train_features = pd.read_csv('dataset/train_features.csv')
train_labels = pd.read_csv('dataset/train_labels.csv')
test_features = pd.read_csv('dataset/test_features.csv')

# Join features and labels side by side and clean them as a single frame;
# the test features go through the same cleaning.
train_data = clean_data(pd.concat([train_features, train_labels], axis=1))
test_features = clean_data(test_features)

# Split the cleaned training frame into the 'votes' target and the predictors.
y_train = train_data['votes']
X_train = train_data.drop(columns=['votes'])

# Fit a linear regression on the cleaned training data.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict for the training rows and for the cleaned test rows.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(test_features)

# Collect the fitted parameters and the training-set metrics.
intercept, coefficients = model.intercept_, model.coef_
mse_train = mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

# Report the fitted parameters and how well the model fits the training data.
print("Intercept:", intercept)
print("Coefficients:", coefficients)
print("Mean Squared Error (Train):", mse_train)
print("R2 Score (Train):", r2_train)

# Show the predictions made for the test data.
print("Test Predictions:", y_test_pred)

# Save the test predictions as a single 'votes' column, without the index.
test_predictions = pd.DataFrame(y_test_pred, columns=['votes'])
test_predictions.to_csv('test_predictions.csv', index=False)
