# Linear regression with scikit-learn

In [2]:
import pandas as pd

# Read the competition's train and test CSV files into DataFrames.
# Both files live in the same Kaggle input directory; they are parsed with
# pandas' default read_csv settings.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/house-prices-advanced-regression-techniques/train.csv',
        '/kaggle/input/house-prices-advanced-regression-techniques/test.csv',
    )
)

In [7]:
# Target: the 'SalePrice' column of the training frame.
y_train = train_data['SalePrice']

# Predictors: every remaining column except the 'Id' row identifier.
# The test frame has no 'SalePrice' column to remove, only 'Id'.
X_train = train_data.drop(columns=['Id', 'SalePrice'])
X_test = test_data.drop(columns=['Id'])

In [1]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

In [4]:
# Partition the predictor columns by storage dtype so that each group can be
# routed to its own preprocessing branch:
#   - object columns are handled as categorical features
#   - int64 / float64 columns are handled as numeric features
categorical_features = X_train.select_dtypes(['object']).columns
numeric_features = X_train.select_dtypes(['int64', 'float64']).columns

In [8]:
# Numeric branch: replace missing values with the column mean, then
# standardise each column with StandardScaler.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: replace missing values with the literal label
# 'missing', then one-hot encode. handle_unknown='ignore' keeps transform
# from raising on categories that were not present when the encoder was fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its branch. Columns listed in neither group are
# not passed on (ColumnTransformer's default `remainder` is 'drop').
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# End-to-end estimator: preprocessing followed by ordinary least squares.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Fit the full pipeline on the training data, then predict for the test
# predictors. Pipeline.fit returns the fitted pipeline itself, so `model`
# is left fitted and the two steps can be chained.
y_pred = model.fit(X_train, y_train).predict(X_test)

In [11]:
# Build the submission table.
# The Ids are read from the test set itself instead of a hard-coded
# range(1461, 2920): y_pred was computed from test_data's rows in order, so
# this pairs every prediction with the Id of the row it came from, whatever
# the size or Id numbering of the test file.
submission = pd.DataFrame({
    "Id": test_data["Id"].to_numpy(),
    "SalePrice": y_pred
})

# Save the DataFrame to a CSV file (index=False: no extra row-index column)
submission.to_csv('submission.csv', index=False)

Unnamed: 0,Id,SalePrice
0,1461,112604.639001
1,1462,159671.713784
2,1463,186457.296566
3,1464,197184.595807
4,1465,205461.31948
