In [1]:
import pandas as pd

In [2]:
# Read the three CSV inputs from the local `data/` folder in one pass:
# the training table, the table to score, and the sample submission file.
train, predict, sample_submission = map(
    pd.read_csv,
    (
        'data/diamonds_train.csv',
        'data/diamonds_test.csv',
        'data/sample_submission.csv',
    ),
)

In [3]:
# Column the model is trained to predict.
TARGET = 'price'

# Input columns, grouped by how they are prepared further down:
# numeric columns are used as-is, categorical ones are one-hot encoded.
# Both stay plain lists because they are used as DataFrame column selectors.
NUM_FEATURES = [
    'carat',
    'depth',
    'table',
    'x',
    'y',
    'z',
]
CAT_FEATURES = [
    'cut',
    'color',
    'clarity',
]

In [4]:
# Cast the categorical columns to pandas' `category` dtype.
#
# The category set is learned from `train` only and then reused for `predict`.
# Casting the two frames independently lets each infer its own categories, so
# the one-hot encoding below could yield different columns per frame and make
# `predict_df[FEATURES]` fail (or silently drop a level).
# With a shared dtype, `pd.get_dummies` emits the same columns for both frames.
# NOTE: a value present only in `predict` becomes NaN here and is therefore
# encoded as an all-zero row for that feature.
for categorical_feature in CAT_FEATURES:
    train[categorical_feature] = train[categorical_feature].astype('category')
    predict[categorical_feature] = predict[categorical_feature].astype(
        train[categorical_feature].dtype
    )

In [5]:
# Build the training design frame: numeric columns are taken unchanged,
# categorical columns are expanded into one indicator column per level,
# and the two parts are placed side by side (indicator columns first).
numerical_train_df = train[NUM_FEATURES]
categorical_train_df = pd.get_dummies(train[CAT_FEATURES])
train_df = pd.concat(
    (categorical_train_df, numerical_train_df),
    axis='columns',
)

In [6]:
# Same preparation as for the training frame, applied to the rows to score:
# indicator columns for the categorical features followed by the numeric ones.
numerical_predict_df = predict[NUM_FEATURES]
categorical_predict_df = pd.get_dummies(predict[CAT_FEATURES])
predict_df = pd.concat(
    (categorical_predict_df, numerical_predict_df),
    axis='columns',
)

In [7]:
# Ordered list of model input columns: the one-hot columns produced from the
# training data, then the numeric columns. The same list is used to select
# columns from both `train_df` and `predict_df`.
FEATURES = [*categorical_train_df.columns, *numerical_train_df.columns]

In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
# Standardising transformer (centre on the mean, scale to unit variance);
# the settings written out here are the library defaults.
scaler = StandardScaler(with_mean=True, with_std=True)

In [10]:
# Learn the scaling statistics from the training features and apply them.
# Note that the scaler is fitted on all training rows, i.e. before the
# hold-out split performed further down.
X = scaler.fit_transform(train_df.loc[:, FEATURES])

In [11]:
# Regression target: the `TARGET` column of the training table, left unscaled.
y = train.loc[:, TARGET]

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Hold out 25% of the rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Show the shapes of the four pieces as a quick sanity check.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((30341, 26), (30341,), (10114, 26), (10114,))

In [14]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares regression with an intercept term (the default),
# fitted on the training part of the split only.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

LinearRegression()

In [15]:
# Scale the rows to score with the statistics already learned by `scaler`
# (transform only, no re-fitting), using the same column order as training.
X_predict = scaler.transform(predict_df.loc[:, FEATURES])

In [16]:
# Predicted target values for every row of the scoring set.
y_pred = model.predict(X=X_predict)

# Display the predictions as the cell output.
y_pred

array([ 3627.4126649 ,  6303.39453974, 10104.2200187 , ...,
        3871.32971237,  1961.21515052,   731.27228255])

In [17]:
# Row identifiers from the scoring table, used to key the submission rows.
diamond_id = predict.loc[:, 'id']

In [18]:
# Two-column submission table: each identifier paired with its predicted price.
submission = pd.DataFrame(
    data={
        'id': diamond_id,
        'price': y_pred,
    },
)

In [19]:
import os

# `DataFrame.to_csv` does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
os.makedirs('submissions', exist_ok=True)

# Write the submission without the DataFrame index so the file contains only
# the `id` and `price` columns.
submission.to_csv('submissions/sample_submission.csv', index=False)