In [2]:
import numpy as np
import pandas as pd

In [3]:
s = pd.Series(np.random.randn(5), index = ['a','b','c','d','e'])

In [4]:
s

a    0.425911
b   -1.265041
c    1.906677
d    0.886063
e    0.474171
dtype: float64

In [5]:
d = {'one' : pd.Series([1.,2.,3.],index=['a','b','c']),
     'two' : pd.Series([1.,2.,3.,4.],index=['a','b','c','d'])}

In [6]:
d

{'one': a    1.0
 b    2.0
 c    3.0
 dtype: float64,
 'two': a    1.0
 b    2.0
 c    3.0
 d    4.0
 dtype: float64}

In [7]:
df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [8]:
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [12]:
df.to_csv('BDMI.csv')

In [13]:
df_2 = pd.read_csv('BDMI.csv')

In [14]:
df_2

Unnamed: 0.1,Unnamed: 0,one,two
0,a,1.0,1.0
1,b,2.0,2.0
2,c,3.0,3.0
3,d,,4.0


In [15]:
info = d = {'ID' : pd.Series([201.,202.,203.],index=['xiaoming','zhuj','chen']),
     'gender' : pd.Series(['male','male','male'],index=['xiaoming','zhuj','chen'])}
info

{'ID': xiaoming    201.0
 zhuj        202.0
 chen        203.0
 dtype: float64,
 'gender': xiaoming    male
 zhuj        male
 chen        male
 dtype: object}

In [16]:
mydf = pd.DataFrame(info)
mydf

Unnamed: 0,ID,gender
xiaoming,201.0,male
zhuj,202.0,male
chen,203.0,male


In [25]:
me = mydf.loc['xiaoming']
me

ID         201
gender    male
Name: xiaoming, dtype: object

In [26]:
df.iloc[0]

one    1.0
two    1.0
Name: a, dtype: float64

In [29]:
pd.DataFrame(mydf, columns = ['ID'])

Unnamed: 0,ID
xiaoming,201.0
zhuj,202.0
chen,203.0


In [40]:
def mse(A):
    sum = 0
    sum_loss = 0
    for i in A:
        sum += i
    mean = sum/len(A)
    for j in A:
        sum_loss += ((j-mean)**2)
    return sum_loss**0.5/len(A)
mse([72,94,79,83,65,81,73,67,85,82])

2.658759109058209

In [38]:
def ce(A,B):
    H = 0
    for i in range(len(A)):
        H += A[i]*np.log(B[i])
    H = -H
    return H
ce([1,0],[0.9,0.1])
ce([0,1],[0.2,0.8])

0.2231435513142097

In [41]:
import pandas as pd
import numpy as np

In [42]:
data = pd.read_excel('数据.xlsx')


In [43]:
data.head()

Unnamed: 0,Q1_性别,Q2_身高（厘米）,Q3_体重 （公斤）,Q4_头发长度（厘米）
0,男,190,70,7
1,女,160,45,20
2,男,179,61,5
3,女,173,60,50
4,男,175,70,15


In [44]:
data = data.rename(columns={'Q1_性别': 'label', 
                            'Q2_身高（厘米）': 'height', 
                            'Q3_体重 （公斤）': 'weight', 
                            'Q4_头发长度（厘米）': 'hair'})

In [45]:
data['label'] = data['label'].apply(lambda x : {'男': 0, '女': 1}[x])

In [46]:
data.head()

Unnamed: 0,label,height,weight,hair
0,0,190,70,7
1,1,160,45,20
2,0,179,61,5
3,1,173,60,50
4,0,175,70,15


In [48]:
features = data[['height', 'weight', 'hair']].to_numpy()
features

array([[190,  70,   7],
       [160,  45,  20],
       [179,  61,   5],
       [173,  60,  50],
       [175,  70,  15],
       [162,  53,  30],
       [170,  80,   5],
       [168,  55,   5],
       [175,  65,   4],
       [175,  70,   4],
       [158,  47,  50],
       [180,  68,   3],
       [180,  75,   5],
       [160,  49,  20],
       [167,  60,  12],
       [160,  49,  40],
       [175,  66,  15],
       [188,  70,  15],
       [171,  63,  10],
       [175,  75,  10],
       [172,  65,   3],
       [160,  52,  50],
       [181,  76,  12],
       [175,  60, 100],
       [172,  59,   5],
       [173,  61,   3],
       [170,  60,   7],
       [180,  58,   5]], dtype=int64)

In [49]:
mean = np.mean(features, axis=0)
std = np.std(features, axis=0)

In [50]:
features = (features - mean)/std

In [51]:
label = data['label'].to_numpy()

In [52]:
label

array([0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0], dtype=int64)

In [53]:
def sigmoid(scores):
    return 1 / (1 + np.exp(-scores))

In [54]:
def log_likelihood(features, target, weights):
    scores = np.dot(features, weights)
    ll = np.sum( target*scores - np.log(1 + np.exp(scores)) )
    return ll

In [55]:
def logistic_regression(features, target, num_steps, learning_rate, add_intercept = False):
    if add_intercept:
        intercept = np.ones((features.shape[0], 1))
        features = np.hstack((intercept, features))
        
    weights = np.zeros(features.shape[1])
    
    for step in range(num_steps):
        scores = np.dot(features, weights)
        predictions = sigmoid(scores)

        # Update weights with log likelihood gradient
        output_error_signal = target - predictions
        
        gradient = np.dot(features.T, output_error_signal)
        weights += learning_rate * gradient

        # Print log-likelihood every so often
        if step % 10000 == 0:
            print(log_likelihood(features, target, weights))
        
    return weights

In [56]:
weights = logistic_regression(features, label,
                     num_steps = 50000, learning_rate = 5e-5, add_intercept=True)

-19.39587862868067
-5.147849382555234
-4.1930128118948
-3.846095045774807
-3.6737244375063693


In [57]:
def predict(features, weights):
    global mean
    global std
    features = (features - mean)/std
    intercept = np.ones((features.shape[0], 1))
    features = np.hstack((intercept, features))
    scores = np.dot(features, weights)
    predictions = sigmoid(scores)
    
    return predictions

In [58]:
student1 = np.array([[188, 85, 2]])
print(predict(student1, weights))

[1.51651419e-05]


In [59]:
student2 = np.array([[165, 50, 25]])
print(predict(student2, weights))

[0.81832577]
