In [76]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression

### Reading csv and converted values in status column : acquired = 1 and else = 0
df = pd.read_csv('data.csv', converters={'status': lambda x : int(x == 'acquired')})
df.head()

Unnamed: 0.1,Unnamed: 0,state_code,latitude,longitude,zip_code,id,city,Unnamed: 6,name,labels,...,object_id,has_VC,has_angel,has_roundA,has_roundB,has_roundC,has_roundD,avg_participants,is_top500,status
0,1005,CA,42.35888,-71.05682,92101,c:6669,San Diego,,Bandsintown,1,...,c:6669,0,1,0,0,0,0,1.0,0,1
1,204,CA,37.238916,-121.973718,95032,c:16283,Los Gatos,,TriCipher,1,...,c:16283,1,0,0,1,1,1,4.75,1,1
2,1001,CA,32.901049,-117.192656,92121,c:65620,San Diego,San Diego CA 92121,Plixi,1,...,c:65620,0,0,1,0,0,0,4.0,1,1
3,738,CA,37.320309,-122.05004,95014,c:42668,Cupertino,Cupertino CA 95014,Solidcore Systems,1,...,c:42668,0,0,0,1,1,1,3.3333,1,1
4,1002,CA,37.779281,-122.419236,94105,c:65806,San Francisco,San Francisco CA 94105,Inhale Digital,0,...,c:65806,1,1,0,0,0,0,1.0,1,0


In [77]:
### how many null values are present
df.isnull().sum()

Unnamed: 0                    0
state_code                    0
latitude                      0
longitude                     0
zip_code                      0
id                            0
city                          0
Unnamed: 6                  493
name                          0
labels                        0
founded_at                    0
closed_at                   588
first_funding_at              0
last_funding_at               0
age_first_funding_year        0
age_last_funding_year         0
age_first_milestone_year    152
age_last_milestone_year     152
relationships                 0
funding_rounds                0
funding_total_usd             0
milestones                    0
state_code.1                  1
is_CA                         0
is_NY                         0
is_MA                         0
is_TX                         0
is_otherstate                 0
category_code                 0
is_software                   0
is_web                        0
is_mobil

In [78]:
### dropped columns
df = df.drop(['Unnamed: 0','Unnamed: 6', 'name', 'founded_at', 'closed_at', 'first_funding_at', 'last_funding_at', 'id', 'object_id', 'labels'], axis=1)
### replaced NA/null values with 0
df['age_first_milestone_year'] = df['age_first_milestone_year'].fillna(axis=0,value=0)
df['age_last_milestone_year'] = df['age_last_milestone_year'].fillna(axis=0,value=0)
df.sample()

Unnamed: 0,state_code,latitude,longitude,zip_code,city,age_first_funding_year,age_last_funding_year,age_first_milestone_year,age_last_milestone_year,relationships,...,is_othercategory,has_VC,has_angel,has_roundA,has_roundB,has_roundC,has_roundD,avg_participants,is_top500,status
101,WA,39.78373,-100.445882,98104,Seattle,5.789,5.789,0.0,0.0,1,...,0,0,0,0,0,0,1,4.0,1,1


In [79]:
### extracted feature columns and target column
features = df.loc[:, df.columns != 'status']
target = df.loc[:, 'status']

### preprocessing - converted string(object) values into incremental value
le = preprocessing.LabelEncoder()
feature_names = features.columns
for feature_name in feature_names:
    if feature_name != 'status' and features[feature_name].dtypes == object:
        features[feature_name] = le.fit_transform(features[feature_name])

### dataframe to array
features_array = features.values
target = target.values

### split the dataset into test and train  train:test=80:20
X_train, X_test, y_train, y_test  = train_test_split(features, target, train_size=0.8)

### print confusion matrix and calculate accuracy rate
def print_performance(pred,actual):
    actual_array = np.array(actual)
    unique_label = np.unique([actual, pred])
    cf = pd.DataFrame(
        confusion_matrix(actual_array, pred, labels=unique_label), 
        index=['Actual:{:}'.format(x) for x in unique_label], 
        columns=['Pred:{:}'.format(x) for x in unique_label]
    )
    print(cf)
    print('Percent Acquired correctly predicted: ', cf['Pred:1'][1]/(cf['Pred:0'][1] +cf['Pred:1'][1])*100)
    print('Percent Not Acquired correctly predicted: ', cf['Pred:0'][0]/(cf['Pred:0'][0] +cf['Pred:1'][0])*100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features[feature_name] = le.fit_transform(features[feature_name])


In [83]:
### Logistic Regression
logistic_clf = LogisticRegression(solver='liblinear', penalty='l1', random_state=42)
logistic_clf.fit(X_train, y_train)
y_pred = logistic_clf.predict(X_test)
print_performance(y_pred, y_test)

          Pred:0  Pred:1
Actual:0      36      33
Actual:1      15     101
Percent Acquired correctly predicted:  87.06896551724138
Percent Not Acquired correctly predicted:  52.17391304347826
