In [33]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
import numpy as np

In [34]:
data = pd.read_csv('produksiPadi.csv')

In [35]:

data.head(), data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53 entries, 0 to 52
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Tahun               53 non-null     int64  
 1   Produksi Padi(Ton)  53 non-null     object 
 2   Unnamed: 2          0 non-null      float64
 3   Unnamed: 3          0 non-null      float64
 4   Unnamed: 4          7 non-null      object 
 5   Unnamed: 5          2 non-null      object 
dtypes: float64(2), int64(1), object(3)
memory usage: 2.6+ KB


(   Tahun Produksi Padi(Ton)  Unnamed: 2  Unnamed: 3  \
 0   1970     18.693.649,00          NaN         NaN   
 1   1971     20.483.687,00          NaN         NaN   
 2   1972     19.393.933,00          NaN         NaN   
 3   1973     21.490.578,00          NaN         NaN   
 4   1974     22.476.073,00          NaN         NaN   
 
                           Unnamed: 4 Unnamed: 5  
 0                                NaN        NaN  
 1                                NaN        NaN  
 2  Ubah data tersebut secara lagging        NaN  
 3                                NaN        NaN  
 4    Lakukan Split Ratio --> 70 : 30        NaN  ,
 None)

In [36]:
# Membersihkan dataset
# Menghapus kolom yang tidak relevan
data_cleaned = data[['Tahun', 'Produksi Padi(Ton)']].copy()
     

In [37]:
# Mengubah kolom "Produksi Padi(Ton)" menjadi numerik (menghapus titik dan mengganti koma desimal)
data_cleaned['Produksi Padi(Ton)'] = (
    data_cleaned['Produksi Padi(Ton)']
    .str.replace('.', '', regex=False)  # Menghapus tanda titik
    .str.replace(',', '.', regex=False)  # Mengganti koma dengan titik
    .astype(float)  # Mengubah ke tipe data float
)

In [38]:
data_cleaned.head(), data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53 entries, 0 to 52
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Tahun               53 non-null     int64  
 1   Produksi Padi(Ton)  53 non-null     float64
dtypes: float64(1), int64(1)
memory usage: 980.0 bytes


(   Tahun  Produksi Padi(Ton)
 0   1970          18693649.0
 1   1971          20483687.0
 2   1972          19393933.0
 3   1973          21490578.0
 4   1974          22476073.0,
 None)

In [39]:
# 1. Create lagging column
data_cleaned['Produksi (t-1)'] = data_cleaned['Produksi Padi(Ton)'].shift(1)


In [40]:
# hapus baris dengan nilai NaN
data_cleaned = data_cleaned.dropna()

In [41]:
# 2. Split data into training and testing sets (70:30 ratio)
X = data_cleaned[['Produksi (t-1)']]
y = data_cleaned['Produksi Padi(Ton)']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [42]:
# Train Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)



In [45]:
# Evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mape = mean_absolute_percentage_error(y_test, y_pred)


In [46]:
# Predict for 2023 and 2024
last_production_2022 = data_cleaned['Produksi Padi(Ton)'].iloc[-1]
pred_2023 = model.predict(pd.DataFrame({'Produksi (t-1)': [last_production_2022]}))[0]
pred_2024 = model.predict(pd.DataFrame({'Produksi (t-1)': [pred_2023]}))[0]

In [48]:

print("Prediksi Produksi Tahun 2023:", pred_2023)
print("Prediksi Produksi Tahun 2024:", pred_2024)

Prediksi Produksi Tahun 2023: 55703980.47924951
Prediksi Produksi Tahun 2024: 57078475.54822506
