In [1]:
## 한글 폰트 문제 해결 
# matplotlib은 한글 폰트를 지원하지 않음
# os정보
import platform
import matplotlib.pyplot as plt

# font_manager : 폰트 관리 모듈
# rc : 폰트 변경 모듈
from matplotlib import font_manager, rc
# unicode 설정
plt.rcParams['axes.unicode_minus'] = False

if platform.system() == 'Darwin':
    rc('font', family='AppleGothic') # os가 macos
elif platform.system() == 'Windows':
    path = 'c:/Windows/Fonts/malgun.ttf' # os가 windows
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
else:
    print("Unknown System")

In [2]:

import pandas as pd 

dataB = pd.read_csv("./Data/사전테스트-환경데이터/environmentsB.csv")
dataC = pd.read_csv("./Data/사전테스트-환경데이터/environmentsC.csv")
dataD = pd.read_csv("./Data/사전테스트-환경데이터/environmentsD.csv")
dataE = pd.read_csv("./Data/사전테스트-환경데이터/environmentsE.csv")

dataR = pd.read_excel("./Data/사전테스트-생육데이터.xlsx")


### 주차 df 만들기

In [3]:
def make_DayToWeek():
    from datetime import datetime, timedelta

    ## datetime date, time 분리
    datalist = [dataB, dataC, dataD, dataE]

    ## datetime을 date로 변경 
    for data in datalist:
        data['datetime'] = pd.to_datetime(data['datetime'])
        data['date'] = data['datetime'].dt.date
        data['time'] = data['datetime'].dt.hour
    

    base_dateB = datetime(2023, 10, 6)
    base_dateC = datetime(2023, 9, 22)  
    base_dateD = datetime(2023, 10, 18)  
    base_dateE = datetime(2023, 9, 22)  

    base_weekB = 4
    base_weekC = 1
    base_weekD = 4
    base_weekE = 1

    # 주차 계산 함수
    def calculate_week(date, base_date, base_week):
        base_date_timestamp = pd.Timestamp(base_date)

        # 날짜 차이 계산
        delta_days = (date - base_date_timestamp).dt.days

        # 기준 주차에서 날짜 차이를 주 단위로 변환
        week = base_week + delta_days // 7
        return week


    datesB = pd.to_datetime(dataB['date']) 
    datesC = pd.to_datetime(dataC['date']) 
    datesD = pd.to_datetime(dataD['date']) 
    datesE = pd.to_datetime(dataE['date']) 

    weeksB = calculate_week(datesB, base_dateB, base_weekB)
    weeksC = calculate_week(datesC, base_dateC, base_weekC)
    weeksD = calculate_week(datesD, base_dateD, base_weekD)
    weeksE = calculate_week(datesE, base_dateE, base_weekE)

    dataB['weeks'] = weeksB
    dataC['weeks'] = weeksC
    dataD['weeks'] = weeksD
    dataE['weeks'] = weeksE

    dataB.head()



# 데이터 정제 및 예측모델 함수화 작업

In [4]:
from sklearn.preprocessing import MinMaxScaler
def merge_input_output(outputfileName,saveFilename):
    #컬럼이름변경
    dataR.rename(columns={'생육주사': '생육주차'}, inplace=True)

    make_DayToWeek()
    

    # 주차와 input 데이터만 데이터셋으로 만듬
    dataCC = dataC.iloc[: ,[2,3,4,5,6,7,10]]
    dataBB = dataB.iloc[: ,[2,3,4,5,6,7,10]]
    dataDD = dataD.iloc[: ,[2,3,4,5,6,7,10]]
    dataEE = dataE.iloc[: ,[2,3,4,5,6,7,10]]

    dataT = pd.concat([dataBB, dataCC, dataDD, dataEE], ignore_index=True)

        # 주차와 input 데이터만 데이터셋으로 만듬
    dataC2 = dataC.iloc[: ,[0,2,3,4,5,6,7,10]]
    dataB2 = dataB.iloc[: ,[0,2,3,4,5,6,7,10]]
    dataD2 = dataD.iloc[: ,[0,2,3,4,5,6,7,10]]
    dataE2 = dataE.iloc[: ,[0,2,3,4,5,6,7,10]]

    # CO2 컬럼 정규화
    datalist2 = [dataB2, dataC2, dataD2, dataE2]

    for data in datalist2:

        min_x = data['innerCO2'] - data['innerCO2'].min()
        min_max = data['innerCO2'].max() - data['innerCO2'].min()
        
        normalCO2 = min_x / min_max
        data['CO2'] = normalCO2
        

    dataE2
    dataT = pd.concat([dataB2, dataC2, dataD2, dataE2], ignore_index=True)

    dataT.rename(columns={"생육주차":"주차"}, inplace=True)

    pivot = pd.read_csv(f"./Data/{outputfileName}.csv",)
    pivot.head()\
    
    dataT.rename(columns={"주차":"weeks"}, inplace=True)
    pivot.rename(columns={"주차":"weeks"}, inplace=True)     

    pivot = pivot.iloc[:,1:]   

    dataT[dataT['farm'] == 'B농가'].iloc[:, 1:]

    list = ['B농가','C농가','D농가','E농가']

    # for i in list:
    grouopB = dataT[dataT['farm'] == 'B농가'].iloc[:, 1:].groupby('weeks').mean()
    grouopC = dataT[dataT['farm'] == 'C농가'].iloc[:, 1:].groupby('weeks').mean()
    grouopD = dataT[dataT['farm'] == 'D농가'].iloc[:, 1:].groupby('weeks').mean()
    grouopE = dataT[dataT['farm'] == 'E농가'].iloc[:, 1:].groupby('weeks').mean()

    grouopB_reset = grouopB.reset_index()
    grouopC_reset = grouopC.reset_index()
    grouopD_reset = grouopD.reset_index()
    grouopE_reset = grouopE.reset_index()

    grouopB_reset['시설아이디'] = 'B농가'
    grouopC_reset['시설아이디'] = 'C농가'
    grouopD_reset['시설아이디'] = 'D농가'
    grouopE_reset['시설아이디'] = 'E농가'

    # grouopB = pivot[pivot['시설아이디'] == 'B농가'].iloc[:, 1:].groupby('weeks').mean()
    # grouopC = pivot[pivot['시설아이디'] == 'C농가'].iloc[:, 1:].groupby('weeks').mean()
    # grouopD = pivot[pivot['시설아이디'] == 'D농가'].iloc[:, 1:].groupby('weeks').mean()
    # grouopE = pivot[pivot['시설아이디'] == 'E농가'].iloc[:, 1:].groupby('weeks').mean()

    # grouopB_out = grouopB.reset_index()
    # grouopC_out = grouopC.reset_index()
    # grouopD_out = grouopD.reset_index()
    # grouopE_out = grouopE.reset_index()

    # grouopB_out['시설아이디'] = 'B농가'
    # grouopC_out['시설아이디'] = 'C농가'
    # grouopD_out['시설아이디'] = 'D농가'
    # grouopE_out['시설아이디'] = 'E농가'

    # pivotData = pd.concat([grouopB_out,grouopC_out,grouopD_out,grouopD_out], ignore_index=True)
    trainData = pd.concat([grouopB_reset,grouopC_reset,grouopD_reset,grouopE_reset], ignore_index=True)

    # dataT = pd.concat([dataB2, dataC2, dataD2, dataE2], ignore_index=True)
    df = pd.merge(pivot, trainData,
                    on=['시설아이디', 'weeks'],
                    how='inner'
                )
    # print(df)

    df.to_csv(f'./Data/{saveFilename}.csv')


In [5]:
merge_input_output(outputfileName="pivot",saveFilename="ML_mean")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['CO2'] = normalCO2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['CO2'] = normalCO2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['CO2'] = normalCO2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the c

In [6]:
ml_mean = pd.read_csv("Data/ML_mean.csv")
ml_mean.head()

Unnamed: 0.1,Unnamed: 0,시설아이디,weeks,조사일자,표본번호,관부직경,엽병장,엽수,엽장,엽폭,...,초장,최종화방차수,화방 꽃수(소화수),supplyEC,supplyPH,innerCO2,innerHum,innerTemp,innerSolar,CO2
0,0,B농가,4,20231006,1,12.39,139.0,5.0,79.0,70.0,...,255.0,0.0,0.0,0.934939,5.998963,928.372744,79.071951,15.957256,120.365122,0.885022
1,1,B농가,4,20231006,2,12.59,146.0,5.0,78.0,77.0,...,251.0,0.0,0.0,0.934939,5.998963,928.372744,79.071951,15.957256,120.365122,0.885022
2,2,B농가,4,20231006,3,13.91,100.0,6.0,73.0,68.0,...,209.0,0.0,0.0,0.934939,5.998963,928.372744,79.071951,15.957256,120.365122,0.885022
3,3,B농가,4,20231006,4,10.36,111.0,4.0,83.0,80.0,...,264.0,0.0,0.0,0.934939,5.998963,928.372744,79.071951,15.957256,120.365122,0.885022
4,4,B농가,4,20231006,5,9.48,203.0,3.0,94.0,94.0,...,307.0,0.0,0.0,0.934939,5.998963,928.372744,79.071951,15.957256,120.365122,0.885022


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neighbors import KNeighborsRegressor
import joblib

def MultiOutputRegressorFunc_KNN(df, saveFileName):

    
    # Define features and targets
    # features = ['supplyEC', 'supplyPH', 'innerCO2', 'innerHum', 'innerTemp', 'CO2']
    features = ['innerCO2', 'innerHum', 'innerTemp', 'CO2']
    targets = ['관부직경', '엽병장', '엽수', '엽장', '엽폭', '개화수', '초장', '최종화방차수', '화방 꽃수(소화수)']
    
    # Prepare the input and target data
    X = df[features]
    y = df[targets]
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Initialize KNN regressor
    knn_regressor = KNeighborsRegressor(n_neighbors=3)
    
    # Initialize and fit MultiOutputRegressor
    multi_output_regressor = MultiOutputRegressor(knn_regressor)
    multi_output_regressor.fit(X_train, y_train)
    
    # Evaluate the model
    score = multi_output_regressor.score(X_test, y_test)
    print(f'Model score: {score}')
    
    # Make predictions
    predictions = multi_output_regressor.predict(X_test)
    
    # Print some example predictions
    print("\nExample predictions:")
    print("실제값".ljust(15), *[f"{target[:7]:7}" for target in targets])
    print("예측값".ljust(15), *[f"{target[:7]:7}" for target in targets])
    print("-" * 100)
    for i in range(min(5, len(predictions))):  # Print first 5 predictions
        print("실제값".ljust(15), *[f"{val:7.2f}" for val in y_test.iloc[i]])
        print("예측값".ljust(15), *[f"{val:7.2f}" for val in predictions[i]])
        print("-" * 100)
    
    # # Save the model
    # filename = f'../Server/MLModels/{saveFileName}.joblib'
    # print(f"\n모델을 {filename}에 저장합니다.")
    # joblib.dump(multi_output_regressor, filename)
    
    return multi_output_regressor

# 사용 예시:
# df = pd.read_csv('your_data.csv')  # 데이터 로드
# model = MultiOutputRegressorFunc_KNN(df, 'multioutput_knn_model')

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
import joblib

def MultiOutputRegressorFunc_RF(df, saveFileName):

    
    # Define features and targets
    # features = ['supplyEC', 'supplyPH', 'innerCO2', 'inne2']
    features = [ 'innerCO2', 'innerHum', 'innerTemp', 'CO2']
    targets = ['관부직경', '엽병장', '엽수', '엽장', '엽폭', '개화수', '초장', '최종화방차수', '화방 꽃수(소화수)']
    
    # Prepare the input and target data
    X = df[features]
    y = df[targets]
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Initialize Random Forest regressor
    rf_regressor = RandomForestRegressor(n_estimators=150, random_state=42)
    
    # Initialize and fit MultiOutputRegressor
    multi_output_regressor = MultiOutputRegressor(rf_regressor)
    multi_output_regressor.fit(X_train, y_train)
    
    # Evaluate the model
    score = multi_output_regressor.score(X_test, y_test)
    print(f'Model score: {score}')
    
    # Make predictions
    predictions = multi_output_regressor.predict(X_test)
    
    # Print some example predictions
    print("\nExample predictions:")
    print("실제값".ljust(15), *[f"{target[:7]:7}" for target in targets])
    print("예측값".ljust(15), *[f"{target[:7]:7}" for target in targets])
    print("-" * 100)
    for i in range(min(5, len(predictions))):  # Print first 5 predictions
        print("실제값".ljust(15), *[f"{val:7.2f}" for val in y_test.iloc[i]])
        print("예측값".ljust(15), *[f"{val:7.2f}" for val in predictions[i]])
        print("-" * 100)
    
    # Calculate and print feature importances
    feature_importances = np.mean([tree.feature_importances_ for tree in multi_output_regressor.estimators_], axis=0)
    print("\nFeature Importances:")
    for feature, importance in zip(features, feature_importances):
        print(f"{feature}: {importance:.4f}")
    
    # # Save the model
    # filename = f'../Server/MLModels/{saveFileName}.joblib'
    # print(f"\n모델을 {filename}에 저장합니다.")
    # joblib.dump(multi_output_regressor, filename)
    
    return multi_output_regressor

# 사용 예시:
# df = pd.read_csv('your_data.csv')  # 데이터 로드
# model = MultiOutputRegressorFunc_RF(df, 'multioutput_rf_model')

## 기초통계량 column 추가 함수 test

In [13]:
# import pandas as pd
# import numpy as np
# from sklearn.preprocessing import MinMaxScaler

# def merge_input_output(outputfileName, saveFilename):
#     # 데이터 전처리
#     dataR.rename(columns={'생육주사': '생육주차'}, inplace=True)
#     make_DayToWeek()  # 이 함수는 별도로 정의되어 있어야 합니다

#     # 농가별 데이터 추출
#     farms = ['B', 'C', 'D', 'E']
#     data_dict = {}
#     for farm in farms:
#         data_dict[farm] = globals()[f'data{farm}'].iloc[:, [0,2,3,4,5,6,7,10]]
        
#         # CO2 정규화
#         min_x = data_dict[farm]['innerCO2'] - data_dict[farm]['innerCO2'].min()
#         min_max = data_dict[farm]['innerCO2'].max() - data_dict[farm]['innerCO2'].min()
#         data_dict[farm]['CO2'] = min_x / min_max

#     # 데이터 병합
#     dataT = pd.concat(list(data_dict.values()), ignore_index=True)
#     dataT.rename(columns={"생육주차": "weeks"}, inplace=True)

#     # pivot 데이터 읽기
#     pivot = pd.read_csv(f"./Data/{outputfileName}.csv")
#     pivot.rename(columns={"주차": "weeks"}, inplace=True)
#     pivot = pivot.iloc[:, 1:]

#     def calculate_stats(group):
#         if isinstance(group, pd.Series):
#             return pd.Series({
#                 'mean': group.mean(),
#                 'max': group.max(),
#                 'min': group.min(),
#                 'sum': group.sum(),
#                 'std': group.std()
#             })
#         else:
#             numeric_cols = group.select_dtypes(include=[np.number]).columns
#             return pd.Series({
#                 'mean': group[numeric_cols].mean(),
#                 'max': group[numeric_cols].max(),
#                 'min': group[numeric_cols].min(),
#                 'sum': group[numeric_cols].sum(),
#                 'std': group[numeric_cols].std()
#             })

#     stats_dfs = []
#     for farm in farms:
#         # dataT에 대한 통계
#         farm_data = dataT[dataT['farm'] == f'{farm}농가'].iloc[:, 1:]
#         numeric_cols = farm_data.select_dtypes(include=[np.number]).columns
#         group_dataT = farm_data.groupby('weeks').agg({col: calculate_stats for col in numeric_cols})
#         group_dataT = group_dataT.reset_index()
#         group_dataT.columns = ['weeks'] + [f'{col[0]}_{col[1]}_input' for col in group_dataT.columns[1:]]
#         group_dataT['시설아이디'] = f'{farm}농가'
        
#         # pivot에 대한 통계
#         farm_pivot = pivot[pivot['시설아이디'] == f'{farm}농가'].iloc[:, 1:]
#         numeric_cols = farm_pivot.select_dtypes(include=[np.number]).columns
#         group_pivot = farm_pivot.groupby('weeks').agg({col: calculate_stats for col in numeric_cols})
#         group_pivot = group_pivot.reset_index()
#         group_pivot.columns = ['weeks'] + [f'{col[0]}_{col[1]}_output' for col in group_pivot.columns[1:]]
        
#         # dataT와 pivot 통계 병합
#         merged_stats = pd.merge(group_dataT, group_pivot, on=['weeks'])
#         stats_dfs.append(merged_stats)

#     # 모든 농가의 통계를 하나의 DataFrame으로 결합
#     all_stats = pd.concat(stats_dfs, ignore_index=True)

#     # 원본 데이터와 통계 데이터 병합
#     result = pd.merge(pivot, all_stats, on=['시설아이디', 'weeks'], how='inner')

#     # 결과를 CSV 파일로 저장
#     result.to_csv(f'./Data/{saveFilename}.csv', index=False)

#     # return result

# # 함수 호출
# # result_df = merge_input_output('output_file_name', 'save_file_name')

In [14]:
# merge_input_output(outputfileName="pivot",saveFilename="ML_mean")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_dict[farm]['CO2'] = min_x / min_max
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_dict[farm]['CO2'] = min_x / min_max
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_dict[farm]['CO2'] = min_x / min_max
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

ValueError: Must produce aggregated value

In [None]:
df = pd.read_csv("Data/ML_mean.csv")

In [None]:
MultiOutputRegressorFunc_RF(df=df,saveFileName="finaltest")

In [None]:
MultiOutputRegressorFunc_KNN(df=df,saveFileName="finaltest")