# Load the data

In [1]:
# Execute the companion loading notebook inside this kernel.
# NOTE(review): the cells below use `df`, `is_valid_state`, `ColumnTypeConverter`,
# `LabelEncoder` and the COLUMNS_* / CATEGORICAL_* constants without defining them
# here, so they are presumably all created by that notebook - confirm.
%run ./0-loading_the_data_define_variables.ipynb

# Handle missing values and drop invalid records

In [2]:

# The prediction model only considers two outcomes, successful and failed,
# so every record rejected by `is_valid_state` is discarded here.
initial_records = len(df)
valid_state_mask = df.apply(is_valid_state, axis=1)  # row-wise boolean mask
df = df[valid_state_mask]
print(f'We dropped {initial_records - len(df)} records.')

We dropped 42448 records.


In [3]:
# Number of missing values per column, largest count first.
df.isna().sum().sort_values(ascending=False)

usd_pledged      210
name               3
ID                 0
category           0
main_category      0
currency           0
deadline           0
goal               0
launched           0
pledged            0
state              0
backers            0
country            0
dtype: int64

In [4]:
# Fill the missing `usd_pledged` values with zero (leaving them as NaN would also
# be acceptable): this column is not used as a feature when training the model.
#
# The result is assigned back instead of calling `fillna(..., inplace=True)` on
# the column selection. That chained in-place form operates on a temporary
# Series: it raises a FutureWarning on recent pandas and silently leaves `df`
# unchanged once Copy-on-Write is enabled.
df['usd_pledged'] = df['usd_pledged'].fillna(0)


# Convert object columns to float, int and datetime

In [5]:

# Wrap the frame in a ColumnTypeConverter and run the three conversions in a
# fixed order: int first, then float, then datetime.
# NOTE(review): the return values are ignored, so the converter presumably
# updates `df` in place - confirm against ColumnTypeConverter.
converter = ColumnTypeConverter(df)
conversion_plan = (
    (converter.convert_to_int, COLUMNS_CONVERT_TO_INT),
    (converter.convert_to_float, COLUMNS_CONVERT_TO_FLOAT),
    (converter.convert_to_datetime, COLUMNS_CONVERT_TO_DATETIME),
)
for convert, columns in conversion_plan:
    convert(columns)

print('Done!')

Done!


# Currency conversion to USD

In [6]:
from newforma_technical_test.src.feature_engineering import convert_goal_to_usd

# Evaluate the helper once per row and store the result as `goal_usd`.
# NOTE(review): the sample output further down shows a 1000.0 GBP goal becoming
# 769.230769 in `goal_usd`; a smaller figure after converting GBP to USD looks
# like the rate is applied in the wrong direction - verify convert_goal_to_usd.
goal_in_usd = df.apply(convert_goal_to_usd, axis=1)
df['goal_usd'] = goal_in_usd


# Deal with the Outliers 

# Deal with the multicollinearity

# Derived features

### Campaigns duration

In [7]:
# Campaign length in whole days: deadline minus launch timestamp, keeping only
# the days component of the resulting timedelta.
campaign_length = df['deadline'] - df['launched']
df['campaigns_duration'] = campaign_length.dt.days
df.head(2)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd_pledged,goal_usd,campaigns_duration
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09 11:36:00,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,769.230769,58
1,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26 00:20:50,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,45000.0,45


## Derive features from deadline and launched date

In [8]:
# Break the launch timestamp into calendar components.
# `weekday` is numbered from Monday=0 to Sunday=6.
launched_on = df['launched'].dt
df = df.assign(
    day_launched=launched_on.day,
    month_launched=launched_on.month,
    year_launched=launched_on.year,
    week_day=launched_on.weekday,
)


##  Derive season from the launched month 

In [9]:
from newforma_technical_test.src.feature_engineering import get_season_by_month

# Map each launch month to its season label and store it in a `season` column.
launch_season = df['month_launched'].apply(get_season_by_month)
df['season'] = launch_season

# Remove unnecessary columns

In [10]:
# Discard the columns listed in COLUMNS_TO_BE_DELETED, then preview the result.
df = df.drop(columns=COLUMNS_TO_BE_DELETED)
df.head(2)

Unnamed: 0,category,main_category,goal,pledged,state,backers,country,usd_pledged,goal_usd,campaigns_duration,day_launched,month_launched,year_launched,week_day,season
0,Poetry,Publishing,1000.0,0.0,failed,0,GB,0.0,769.230769,58,11,8,2015,1,Summer
1,Narrative Film,Film & Video,45000.0,220.0,failed,3,US,220.0,45000.0,45,12,1,2013,5,Winter


# Handle the categorical variables

## Ordinal Variable

In [11]:
from newforma_technical_test.src.feature_engineering import ordinal_encoding

# Target encoding: failed campaigns become 0, successful campaigns become 1.
mapper = dict(failed=0, successful=1)
df = ordinal_encoding(df, 'state', mapper)
df.head(2)

Unnamed: 0,category,main_category,goal,pledged,state,backers,country,usd_pledged,goal_usd,campaigns_duration,day_launched,month_launched,year_launched,week_day,season
0,Poetry,Publishing,1000.0,0.0,0,0,GB,0.0,769.230769,58,11,8,2015,1,Summer
1,Narrative Film,Film & Video,45000.0,220.0,0,3,US,220.0,45000.0,45,12,1,2013,5,Winter


##  Nominal variables 

In [12]:
from newforma_technical_test.src.feature_engineering import binary_encoding

# Nominal (unordered) features that receive binary-encoded indicator columns
# (`<column>_0`, `<column>_1`, ...); the preview shows the source columns are
# kept next to the new ones.
categorical_cols = [
    'category',
    'main_category',
    'country',
    'season',
]
df = binary_encoding(df, categorical_cols)
df.head(2)

Unnamed: 0,category,main_category,goal,pledged,state,backers,country,usd_pledged,goal_usd,campaigns_duration,...,main_category_2,main_category_3,country_0,country_1,country_2,country_3,country_4,season_0,season_1,season_2
0,Poetry,Publishing,1000.0,0.0,0,0,GB,0.0,769.230769,58,...,0,1,0,0,0,0,1,0,0,1
1,Narrative Film,Film & Video,45000.0,220.0,0,3,US,220.0,45000.0,45,...,1,0,0,0,0,1,0,0,1,0


In [13]:
# Also label-encode the raw categorical columns (one integer code per distinct
# value) for later experimentation: the binary encoding above produces a large
# number of columns and the available computational resources are limited.
#
# 'season' is added with an order-preserving de-duplication so that re-running
# this cell does not keep appending it, which would create duplicate column
# labels in the selection below.
CATEGORICAL_COLUMNS_TO_ENCODE = list(dict.fromkeys([*CATEGORICAL_COLUMNS_TO_ENCODE, 'season']))

# The encoder is refit on each column independently; the fitted encoders are not
# kept, so the integer codes cannot be mapped back to labels afterwards.
df[CATEGORICAL_COLUMNS_TO_ENCODE] = df[CATEGORICAL_COLUMNS_TO_ENCODE].apply(LabelEncoder().fit_transform)
df.head(2)

Unnamed: 0,category,main_category,goal,pledged,state,backers,country,usd_pledged,goal_usd,campaigns_duration,...,main_category_2,main_category_3,country_0,country_1,country_2,country_3,country_4,season_0,season_1,season_2
0,107,12,1000.0,0.0,0,0,9,0.0,769.230769,58,...,0,1,0,0,0,0,1,0,0,1
1,92,6,45000.0,220.0,0,3,21,220.0,45000.0,45,...,1,0,0,0,0,1,0,0,1,0


# Handle the unbalanced dataset

In [15]:
from newforma_technical_test.src.data_split import handle_unbalanced_dataset


# Build the class-balanced frame that the train/test split below consumes.
# NOTE(review): the balancing strategy (under- vs over-sampling) and whether it
# is seeded are not visible here - confirm in handle_unbalanced_dataset.
balanced_df = handle_unbalanced_dataset(df) 

# Split the data: training and testing

In [17]:
from newforma_technical_test.src.data_split import dataset_balanced_split

# Hold out 20% of the balanced data for testing, then report both sizes.
# NOTE(review): confirm the helper uses a fixed seed so the split is reproducible.
train_df, test_df = dataset_balanced_split(
    balanced_df,
    testing_percentage=0.2,
)
tuple(map(len, (train_df, test_df)))

(269152, 67290)

In [19]:
# Sanity check: the test set holds as many successful (1) as failed (0) rows.
successful_rows = test_df[test_df.state == 1]
failed_rows = test_df[test_df.state == 0]
len(successful_rows) == len(failed_rows)

True

# Save the data 

In [20]:
# Write both splits to CSV without the index column.
train_file_path = '../data/preprocessed_train_ks_dataset.csv'
test_file_path = '../data/preprocessed_test_ks_dataset.csv'
for frame, destination in ((train_df, train_file_path), (test_df, test_file_path)):
    frame.to_csv(destination, index=False)
print('Preprocessed dataset saved correctly!')

Preprocessed dataset saved correctly!


### 