In [151]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.ensemble import RandomForestRegressor

In [152]:
# Read the labelled training split and the unlabelled test split.
train, test = (
    pd.read_csv("/content/train_data.csv"),
    pd.read_csv("/content/test_data.csv"),
)

In [153]:
def Avg_owner(x):
    """Return the integer midpoint of an "Estimated owners" range string.

    The input is expected to look like ``"20,000 - 50,000"``; en and em
    dashes are accepted in place of the hyphen, and thousands separators
    are ignored.

    Parameters
    ----------
    x : str or missing value
        Range text of the form ``"<low> - <high>"``.

    Returns
    -------
    int
        ``(low + high) // 2`` where each bound is truncated to an integer
        first. ``0`` is returned when ``x`` is missing, is not a string,
        does not split into exactly two parts, or has a bound that cannot
        be parsed as a finite number.
    """
    if pd.isna(x):
        return 0
    try:
        # Normalize en/em dashes so every range is split on a plain hyphen.
        bounds = x.replace("–", "-").replace("—", "-").split("-")
        if len(bounds) != 2:
            return 0
        # Drop thousands separators; int(float(...)) truncates toward zero.
        low, high = (
            int(float(part.strip().replace(",", ""))) for part in bounds
        )
    except (AttributeError, TypeError, ValueError, OverflowError):
        # Only parsing failures fall back to 0 (non-string input, non-numeric
        # text, nan/inf bounds). Anything else, e.g. KeyboardInterrupt, is
        # allowed to propagate instead of being silently swallowed.
        return 0
    return (low + high) // 2

# Derive the numeric "Avg owners" column on both splits from the raw range text.
for frame in (train, test):
    frame["Avg owners"] = frame["Estimated owners"].apply(Avg_owner)

# The test-set midpoints are kept under their own name for the subtask 1 answers.
subtask1 = test["Avg owners"]


In [154]:
# Numeric model inputs; "Avg owners" is the column derived from "Estimated owners".
numericfeatures = [
    "Metacritic score",
    "Recommendations",
    "Positive",
    "Negative",
    "Avg owners",
]
# Full input set: the numeric columns plus the raw "Genres" text column.
features = [*numericfeatures, "Genres"]

# Design matrix and target from the training split, then the matching test matrix.
x_train, y_train = train[features], train["Price"]
x_test = test[features]

In [155]:
# Per-column preprocessing: TF-IDF on the genre text, standardization on the numbers.
preprocessor = ColumnTransformer([
    # "Genres" is given as a bare string (not a list) so the vectorizer
    # receives a 1-D sequence of documents; vocabulary is capped at 50 terms.
    ("genres", TfidfVectorizer(max_features=50), "Genres"),
    ("numericFeatures", StandardScaler(), numericfeatures),
])

In [156]:
# End-to-end estimator: preprocessing followed by a random-forest regressor.
model = Pipeline([
    ("preprocessor", preprocessor),
    (
        "regressor",
        RandomForestRegressor(
            n_estimators=300,
            max_features="sqrt",
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            random_state=42,  # fixed seed so repeated runs build the same forest
            n_jobs=-1,        # use every available core
        ),
    ),
])

In [157]:
# Fit the whole pipeline on the training split, then predict "Price" for the test split.
# fit() returns the fitted pipeline itself, so the two calls can be chained.
y_pred = model.fit(x_train, y_train).predict(x_test)

In [158]:
# Build the submission in long format: one row per (subtask, datapoint).
# Each subtask is assembled as a whole frame and the two are concatenated once,
# instead of growing `solution` row by row (which re-copies the frame on every
# iteration and is quadratic in the number of test rows).
app_ids = test['AppID'].to_numpy()  # positional, so it does not depend on test's index labels

solution = pd.concat(
    [
        # Subtask 1: average-owner estimate per app.
        pd.DataFrame({
            'subtaskID': 1,
            'datapointID': app_ids,
            # Same conversion as int(float(value)) applied to every element.
            'answer': subtask1.astype(float).astype(int).to_numpy(),
        }),
        # Subtask 2: model prediction per app.
        pd.DataFrame({
            'subtaskID': 2,
            'datapointID': app_ids,
            'answer': y_pred,
        }),
    ],
    ignore_index=True,
)
# NOTE(review): stacking integer and float answers in one column upcasts the
# column to float, so subtask 1 answers are written like "10000.0". The
# row-by-row version produced the same result; confirm the grader accepts it.
solution.to_csv('submission1.csv', index=False)