In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.model_selection import train_test_split


# Load the Sonar data set. The file is read without a header row, so the
# columns are addressed by integer position; column 60 is used as the target.
SONAR_URL = "http://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data"
df = pd.read_csv(SONAR_URL, header=None)

# Quick look at the data. Only df.info() prints by itself; the other
# expressions are shown by Jupyter only when they are the last line of a cell.
df.head()
df.tail()
df.info()
df.describe()
df.columns
df.shape

# Features are every column except 60; column 60 is the label.
X = df.drop(columns=[60])
y = df.loc[:, 60]


# Handle missing values. SimpleImputer strategies: "mean", "median",
# "most_frequent" (the mode).
df.isnull().sum()
imputer = SimpleImputer(strategy="mean")
imputer.fit(X)
X = imputer.transform(X)
# NOTE(review): the imputer is fitted on all rows before the train/test split
# below, so test rows influence the fill values -- confirm this is acceptable.
# With scikit-learn's default output setting X is now a NumPy array, so it no
# longer has `.columns`.


# Encode the target.
# LabelEncoder maps each class label to an integer and returns a 1-D array,
# which is what classifiers and accuracy_score expect for `y`.
# OneHotEncoder is intended for 2-D *feature* matrices (unordered categorical
# columns); calling its fit_transform on the 1-D target raises
# "Expected 2D array", so it must not replace the LabelEncoder here.
encoder = preprocessing.LabelEncoder()
y = encoder.fit_transform(y)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split
_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = _parts

# Report the shape of each of the four pieces.
for _caption, _part in (
    ("Train_X:", X_train),
    ("Train_y:", y_train),
    ("Test_X:", X_test),
    ("Test_y:", y_test),
):
    print(_caption, _part.shape)

# Feature scaling. Two common choices:
#   preprocessing.MinMaxScaler()   -> rescales each feature to [0, 1]
#   preprocessing.StandardScaler() -> zero mean, unit variance per feature
# Only one scaler can be applied; MinMaxScaler is the one that was actually
# used (a StandardScaler instance used to be created and immediately
# overwritten, which made it look as if both were applied).
scaler = preprocessing.MinMaxScaler()
# Fit on the training split only, then reuse the fitted scaler on the test
# split so no test statistics leak into the scaling.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# Dimensionality reduction with PCA: the components are learned from the
# training split and the same projection is then applied to the test split.
from sklearn.decomposition import PCA
N_PCA_COMPONENTS = 4
pca = PCA(n_components=N_PCA_COMPONENTS)
X_train, X_test = pca.fit_transform(X_train), pca.transform(X_test)

# Bag-of-words + TruncatedSVD: the text-data counterpart of scaling + PCA.
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
svd = TruncatedSVD(n_components=50)

# CountVectorizer tokenises raw strings, so it cannot be run on the numeric
# matrix produced by the PCA step (it fails trying to lower-case the rows),
# and 50 SVD components cannot be extracted from a 4-column matrix.
# The step is therefore applied only when the features are a 1-D collection
# of documents.
_features_are_text = (
    np.ndim(X_train) == 1 and np.asarray(X_train).dtype.kind in "OUS"
)
if _features_are_text:
    X_train = svd.fit_transform(vec.fit_transform(X_train))
    # The test split must pass through the *fitted* vectorizer and SVD as
    # well, otherwise train and test end up in different feature spaces.
    X_test = svd.transform(vec.transform(X_test))



# pipeline and gridsearch and knn or decision tree
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score


# Pipeline + grid search over k-NN, random forest and decision tree.
#
# Every step of a Pipeline except the last must be a transformer, so the
# classifiers cannot be chained one after another. Instead the final "clf"
# step is a placeholder that the grid search swaps for each candidate model.
# The text steps (CountVectorizer / TruncatedSVD) are left out because the
# features at this point are numeric.
pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
    ("clf", KNeighborsClassifier()),
])

# One sub-grid per candidate classifier; GridSearchCV accepts a list of grids
# and only combines the parameters that appear in the same dictionary.
# random_state is fixed on the stochastic models so the search is repeatable.
params = [
    {"clf": [KNeighborsClassifier()],
     "clf__n_neighbors": [3, 5, 7, 9, 10, 15, 20, 25, 30]},
    {"clf": [RandomForestClassifier(random_state=42)],
     "clf__n_estimators": [10, 50, 100],
     "clf__criterion": ["gini", "entropy", "log_loss"]},
    {"clf": [DecisionTreeClassifier(random_state=42)],
     "clf__criterion": ["gini", "entropy"]},
]

gs = GridSearchCV(pipeline, params, scoring="accuracy", cv=5) # scoring="f1"
gs.fit(X_train, y_train)
print(gs.best_params_)
print(gs.best_score_)

# Apply to the test set. With the default refit=True, best_estimator_ has
# already been re-fitted on the whole training split using the best
# parameters, so no manual set_params()/fit() round trip is needed.
best_model = gs.best_estimator_
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)

In [None]:
#Data pre-processing
'''
  - Dealing with Missing values
  - Scaling Data
  - Handling Categorical Data
  - Feature Selection
  - Outlier detection
  - Dimensionality Reduction
'''

import pandas as pd
from sklearn import neighbors
from sklearn import metrics
from sklearn import model_selection
import matplotlib.pyplot as plt

# Wine data set: the file has no header row, so the column names are
# assigned by hand after loading.
WINE_URL = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
WINE_COLUMNS = [
    "Class", "Alcohol", "Malic acid", "Ash", "Alcalinity of ash",
    "Magnesium", "Total phenols", "Flavanoids", "Nonflavanoid phenols",
    "Proanthocyanins", "Color intensity", "Hue",
    "OD280/OD315 of diluted wines", "Proline",
]
df = pd.read_csv(WINE_URL, header=None)
df.columns = WINE_COLUMNS

# "Class" is the label; the columns from "Alcohol" through "Proline" are the features.
labels = df["Class"]
features = df.loc[:, "Alcohol":"Proline"]

# 80/20 split with a fixed seed.
train_features, test_features, train_labels, test_labels = model_selection.train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=0,
)

# Grid search over k for a k-NN classifier (GridSearchCV's default
# cross-validation is used because no `cv` is given).
parameters = {'n_neighbors': [1, 3, 5, 7, 11]}
knn = neighbors.KNeighborsClassifier()

clf = model_selection.GridSearchCV(knn, parameters).fit(train_features, train_labels)

# Summarise the winning configuration and its cross-validated accuracy.
for _caption, _value in (
    ("The best classifier is:", clf.best_estimator_),
    ("The accuracy is: ", clf.best_score_),
    ("Its parameters are:", clf.best_params_),
):
    print(_caption, _value)


from sklearn import tree

# Grid search over decision-tree hyper-parameters with 10-fold CV.
# random_state is fixed because scikit-learn permutes the candidate features
# at every split, so an unseeded tree can differ from run to run.
dtc = tree.DecisionTreeClassifier(random_state=0)
parameters = {'criterion':["entropy", "gini"],'max_depth':[2, 3, 4], 'min_impurity_decrease':[0.01, 0.1, 0.2]}

clf = model_selection.GridSearchCV(dtc, parameters, cv=10)
clf.fit(train_features, train_labels)

print("The best classifier is:", clf.best_estimator_)
print("The accuracy is: ", clf.best_score_)
print("Its parameters are:", clf.best_params_)

# Printed explicitly: a bare expression in the middle of a cell is evaluated
# and thrown away, so the held-out accuracy was never shown.
print("Test accuracy:", clf.best_estimator_.score(test_features, test_labels))



# PCA
from sklearn import neighbors
from sklearn import metrics
from sklearn import model_selection
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.datasets import load_breast_cancer


# Breast-cancer data: 80/20 split with a fixed seed.
data = load_breast_cancer()
train_features, test_features, train_labels, test_labels = model_selection.train_test_split(
    data.data,
    data.target,
    test_size=0.2,
    random_state=10,
)

# Min-max scaling: the ranges are learned from the training rows and then
# applied unchanged to the test rows.
scaler = MinMaxScaler().fit(train_features)
train_features = scaler.transform(train_features)
test_features = scaler.transform(test_features)
print("Training set size ", train_features.shape)

# Project onto 4 principal components learned from the training rows.
pca = PCA(n_components=4)
train_features, test_features = pca.fit_transform(train_features), pca.transform(test_features)
print("Training set size after PCA", train_features.shape)


# Tune k for k-NN on the PCA-reduced features.
parameters = {'n_neighbors': [1, 3, 5, 7, 11]}
knn = neighbors.KNeighborsClassifier()
clf = model_selection.GridSearchCV(knn, parameters).fit(train_features, train_labels)

for _caption, _value in (
    ("The best classifier is:", clf.best_estimator_),
    ("Its accuracy is:", clf.best_score_),
    ("Its parameters are:", clf.best_params_),
):
    print(_caption, _value)

# using Pipelines
from sklearn import neighbors
from sklearn import metrics
from sklearn import model_selection
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.datasets import load_breast_cancer
from sklearn.pipeline import Pipeline

# Reload and re-split so the pipeline starts from the raw, unscaled features.
data = load_breast_cancer()
train_features, test_features, train_labels, test_labels = model_selection.train_test_split(data.data, data.target, test_size=0.2, random_state=10)

# Scaling -> PCA (8 components) -> 5-NN, chained so that every step is fitted
# on the training data only. Keyword arguments make the meaning of 8 and 5
# explicit.
pipe_lr = Pipeline([
    ('scl', MinMaxScaler()),
    ('dr', PCA(n_components=8)),
    ('clf', neighbors.KNeighborsClassifier(n_neighbors=5)),
])

pipe_lr.fit(train_features, train_labels)

predictedResults = pipe_lr.predict(test_features)
# scikit-learn metrics take (y_true, y_pred). Accuracy happens to be
# symmetric, but the swapped order would silently give wrong numbers as soon
# as the metric is changed to precision, recall or a confusion matrix.
print(metrics.accuracy_score(test_labels, predictedResults))
# Alternatively we could substitute this line (instead of the last two lines)
print('Test Accuracy:', pipe_lr.score(test_features, test_labels))



# Cross-validate the whole pipeline with 10 folds and report the mean score.
from sklearn.model_selection import cross_val_score
results = cross_val_score(
    pipe_lr,
    train_features,
    train_labels,
    cv=10,
)
print(results.mean())


# LabNotes

## Lab07

In [None]:
# Consider the Sonar dataset: http://archive.ics.uci.edu/ml/datasets/connectionist+bench+(sonar,+mines+vs.+rocks)

# This dataset is used for binary classification, the last column is the target.

# Task:
# 1. Load the dataset (read without a header row, so columns are integers).
import pandas as pd
SONAR_DATA_URL = "http://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data"
df = pd.read_csv(SONAR_DATA_URL, header=None)

df[60] # this is the label

# 2. Use a label encoder to encode the target: the encoder is fitted on the
#    label column, which is then replaced by its integer codes.
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder().fit(df[60])
df[60] = encoder.transform(df[60])

# 3. Train-test split 80-20, with 42 as the random seed.
from sklearn.model_selection import train_test_split
y = df[60]
X = df.drop(columns=[60])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 4. Train a Decision Tree on the dataset and report the accuracy result.
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# random_state is fixed (same seed as the split) because scikit-learn permutes
# the candidate features at each split; without it the reported accuracy
# changes from run to run.
tree_classifier = DecisionTreeClassifier(random_state=42)
tree_classifier.fit(X_train, y_train)

prediction = tree_classifier.predict(X_test)
print(accuracy_score(y_test, prediction))
# An earlier, unseeded run printed 0.6428571428571429; re-run the cell to
# record the value for the seeded tree.

# 5. Improve the Decision Tree by defining a parameter grid and using Grid Search.
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[1, 2, 3, 4],
    min_samples_leaf=[1, 2],
)

# GridSearchCV is handed the existing tree_classifier as its estimator.
searcher = GridSearchCV(tree_classifier, param_grid).fit(X_train, y_train)

best_params = searcher.best_params_
print("Best parameters found: ", best_params)

# Score the best estimator found by the search on the held-out test split.
best_classifier = searcher.best_estimator_
prediction = best_classifier.predict(X_test)
print(accuracy_score(y_test, prediction))
# Recorded output of a previous run:
# Best parameters found:  {'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 2} 0.6666666666666666

# 6. Test a different classification model, the SVC:
#    https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
from sklearn.svm import SVC

SVC_classifier = SVC(random_state=42).fit(X_train, y_train)

prediction = SVC_classifier.predict(X_test)
svc_accuracy = accuracy_score(y_test, prediction)
print(svc_accuracy)
# Recorded output of a previous run: 0.8333333333333334
# 7. Which has a higher accuracy?

## Lab08

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 1. Dataset: CSV of reviews; the 'Review' and 'Sentiment' columns are used below.
REVIEWS_URL = "https://github.com/andvise/DataAnalyticsDatasets/blob/8e8f6475f49d2a587e4f5c76cdf0b011b22c6ac1/dataset_5000_reviews.csv?raw=true"
df = pd.read_csv(REVIEWS_URL)

# First and last rows (shown by Jupyter only when last in a cell).
df.head()
df.tail()

# Class balance of the target column.
df['Sentiment'].value_counts()

# 2. Preprocessing
# Label-encode the sentiment column; the review text is the only feature.
from sklearn.preprocessing import LabelEncoder

X = df['Review']

encoder = LabelEncoder()
y = encoder.fit_transform(df['Sentiment'])

# Split the data set into a training set (80%) and a test set (20%).

from sklearn.model_selection import train_test_split
_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = _parts

for _caption, _part in (
    ("Train_X:", X_train),
    ("Train_y:", y_train),
    ("Test_X:", X_test),
    ("Test_y:", y_test),
):
    print(_caption, _part.shape)

# 3. Classification Task

'''
Create a machine learning approach using Count Vectorizer and KNN Classifier.
Use the given parameters grid and GridSearchCV to find the optimal set
Fit the the best model on the full training set.
Evaluate its performance on the test set.
Assess if adding TruncatedSVD (feature extraction) is improving the performance
'''

from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn import model_selection
import matplotlib.pyplot as plt

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer()

# TruncatedSVD uses a randomized solver by default, so the seed is fixed to
# make the reduced features (and every score below) repeatable.
svd = TruncatedSVD(n_components=50, random_state=42)

X_train = vec.fit_transform(X_train) # applying BoW
X_train = svd.fit_transform(X_train) # applying SVD -> similar to PCA and better suited to sparse data

knn = KNeighborsClassifier()
parameters = {'n_neighbors': [1, 3, 5],
              'p': [1,2]}
clf = model_selection.GridSearchCV(knn, parameters)
clf.fit(X_train, y_train)

print("The best classifier is:", clf.best_estimator_)
print("Its accuracy is:",clf.best_score_)
print("Its parameters are:",clf.best_params_)

# The test documents go through the vectorizer and SVD fitted on the
# training documents (transform only, never fit).
X_test = vec.transform(X_test)
X_test = svd.transform(X_test)
# Printed explicitly: as a bare mid-cell expression this score was computed
# and then discarded.
print("Test accuracy (best_estimator_):", clf.best_estimator_.score(X_test, y_test))

# Fit the best model on the full training set. The parameters come from the
# search result instead of being typed in by hand, so this stays correct if
# the grid or the data changes.
from sklearn.metrics import accuracy_score
classifier = KNeighborsClassifier(**clf.best_params_)
classifier.fit(X_train, y_train)
y1 = classifier.predict(X_test)

print(accuracy_score(y_test, y1))

Train_X: (4000,)
Train_y: (4000,)
Test_X: (1000,)
Test_y: (1000,)
The best classifier is: KNeighborsClassifier(p=1)
Its accuracy is: 0.609
Its parameters are: {'n_neighbors': 5, 'p': 1}


0.608

# Lecture notes

In [None]:
# L2pandas
# df.<attribute>: dtypes, columns, shape, values, ...

# L3 preprocessing in pandas
# 1.data frame methods
#   df.head(3), df.tail(2), df.describe(), max(), min(), std(), dropna()
# 2. missing values
#   dropna(), dropna(how="all"),dropna(axis=1, how="all"), fillna(0), isnull(), notnull()
# 3. data selection
#   iloc: df.iloc[0], df.iloc[-1], df.iloc[:,0],df.iloc[:, 0:2], df.iloc[1:3, 0:2], df.iloc[[0,5],[1,3]]
# 4. groupby
#   total sum: df.groupby('rank')['salary'].sum()
#   max and min: df.groupby('rank')['salary'].max() / .min()
#   count number of elements: df.groupby('rank')['salary'].count()
#   average: df.groupby('rank')['salary'].mean()
#   median: df.groupby('rank')['salary'].median()
# 5. aggregation function
#   min, max, count, sum, prod, mean, median

# L5 - KNN
# Min-max vs. standard normalisation on a tiny hand-written example.
from sklearn import preprocessing
import numpy as np
customer = np.array([
    [35, 35, 3],
    [22, 50, 2],
    [63, 200, 1],
    [59, 170, 1],
    [25, 40, 4],
])
# Alternative: scalingObj = preprocessing.MinMaxScaler()
scalingObj = preprocessing.StandardScaler().fit(customer)
new_customer = scalingObj.transform(customer)

# A new row is scaled with the statistics learned from `customer`.
david = np.array([[37, 50, 2]])
new_david = scalingObj.transform(david)


# L10-11 grid, cross-validation, knn, decision tree, confusion matrix
import pandas as pd
from sklearn import neighbors
from sklearn import metrics
from sklearn import model_selection
import matplotlib.pyplot as plt

# Wine data set: loaded without a header row, then given explicit column names.
WINE_DATA_URL = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
WINE_COLUMN_NAMES = [
    "Class", "Alcohol", "Malic acid", "Ash", "Alcalinity of ash",
    "Magnesium", "Total phenols", "Flavanoids", "Nonflavanoid phenols",
    "Proanthocyanins", "Color intensity", "Hue",
    "OD280/OD315 of diluted wines", "Proline",
]
df = pd.read_csv(WINE_DATA_URL, header=None)
df.columns = WINE_COLUMN_NAMES

# Label column and feature columns.
labels = df["Class"]
features = df.loc[:, "Alcohol":"Proline"]

# Train-test split (80/20, fixed seed).
train_features, test_features, train_labels, test_labels = model_selection.train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=0,
)

# k-NN with a fixed k=5 (no grid search is run in this section).
knn = neighbors.KNeighborsClassifier(n_neighbors=5)

# Train the k-NN model and predict the held-out rows.
knn.fit(train_features, train_labels)
predictions = knn.predict(test_features)

# Confusion matrix, with rows/columns ordered by the classes the model learned.
class_order = knn.classes_
cm = metrics.confusion_matrix(test_labels, predictions, labels=class_order)
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_order)

#disp.plot()
#plt.show()

# 10-fold cross-validation of the model on the training split.
scores = model_selection.cross_val_score(knn, train_features, train_labels, cv=10)
print(scores)

# Mean and standard deviation of the fold scores.
mean_score, score_sd = scores.mean(), scores.std()
print("Accuracy: %.2f,  Standard deviation of: %.2f" % (mean_score, score_sd))

# Decision tree parameters worth tuning:
#   criterion, splitter, max_depth, min_samples_split, min_samples_leaf,
#   min_weight_fraction_leaf

from sklearn import tree

dtc = tree.DecisionTreeClassifier(criterion="entropy", max_depth=2, min_samples_leaf=5)
dtc = dtc.fit(train_features, train_labels)
# Printed explicitly: as a bare mid-cell expression the test accuracy was
# computed and then discarded.
print("Test accuracy:", dtc.score(test_features, test_labels))

# Draw the fitted tree.
plt.figure(figsize=(10, 10)) # Resize figure
# feature_names is passed as a plain list; NOTE(review): some scikit-learn
# releases reject a pandas Index here -- a list works on all of them.
tree.plot_tree(dtc, feature_names=list(features.columns), filled=True, rounded=True)

# Confusion matrix on the test split.
predictions = dtc.predict(test_features)
cm = metrics.confusion_matrix(test_labels, predictions, labels = dtc.classes_)
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=dtc.classes_)
disp.plot()
plt.show()


# Grid search over decision-tree hyper-parameters with 10-fold CV.
from sklearn import tree

# random_state is fixed because scikit-learn permutes the candidate features
# at every split, so an unseeded tree can differ from run to run.
dtc = tree.DecisionTreeClassifier(random_state=0)
parameters = {'criterion':["entropy", "gini"],'max_depth':[2, 3, 4], 'min_impurity_decrease':[0.01, 0.1, 0.2]}

clf = model_selection.GridSearchCV(dtc, parameters, cv=10)
clf.fit(train_features, train_labels)

print("The best classifier is:", clf.best_estimator_)
print("The accuracy is: ", clf.best_score_)
print("Its parameters are:", clf.best_params_)

# Printed explicitly: a bare expression in the middle of a cell is evaluated
# and thrown away, so the held-out accuracy was never shown.
print("Test accuracy:", clf.best_estimator_.score(test_features, test_labels))


# l10

import pandas as pd
# Four small columns for a toy student table.
seriesA = pd.Series(['A', 'C', 'B'])
seriesB = pd.Series([21, 18, 19])
seriesC = pd.Series([4, 1, 1])
seriesD = pd.Series(['Computing', 'Biology', 'Chemistry'])

_columns = {
    'Grade': seriesA,
    'Age': seriesB,
    'DegreeYear': seriesC,
    'Department': seriesD,
}
df = pd.DataFrame(_columns)

# Ordinal encoding by hand: grades have a natural order, F (0) up to A (4).
grade_mapping = dict(zip(['F', 'D', 'C', 'B', 'A'], range(5)))
df['Grade'] = df['Grade'].map(grade_mapping)


# label encoding
from sklearn.preprocessing import LabelEncoder
class_le = LabelEncoder()
df['Department'] = class_le.fit_transform(df['Department'].values)
print(df)


# Stop words and CountVectorizer on three short documents.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
tweets = [
"No one is born hating another person because of the color of his skin or his background or his religion.",
"People must learn to hate, and if they can learn to hate, they can be taught to love.",
"For love comes more naturally to the human heart than its opposite."
]

# Build the vectorizer with the built-in English stop-word list, fit it on
# the documents, then transform the same documents with the fitted vectorizer.
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit(tweets).transform(tweets)
print(X)




