In [1]:
import sys
import os
import pandas
import csv
from sklearn.model_selection import train_test_split
from sklearn import svm, metrics, preprocessing, linear_model, neighbors, ensemble, neural_network
# from "feature_extraction.ipynb" import get_feature_vector
import import_ipynb
from feature_extraction import get_feature_vector
import glob
import fleep
from Image import Image
import ast
import joblib

importing Jupyter notebook from feature_extraction.ipynb
importing Jupyter notebook from Image.ipynb


In [4]:
def create_svm_classifier(csv_file, joblib_file, random_state=None):
    '''
        Train an SVM classifier (polynomial kernel) on the feature vectors stored
        in a csv file and print its accuracy on the train and test splits.

        csv_file: path of a csv holding an 'img_name' column, a 'class' label
            column and one column per feature.
        joblib_file: intended destination of the trained model. Persisting is
            disabled for now (see the commented-out joblib.dump at the end), so
            this argument is currently unused.
        random_state: optional seed forwarded to train_test_split so the 80/20
            split can be reproduced. None (default) keeps the split random.

        Returns None; the accuracies are only printed.
    '''
    training_data = pandas.read_csv(csv_file)  # import our training data from the csv file

    x, y = training_data.drop(['img_name', 'class'], axis=1), training_data['class']

    # Split first: 80% for training, 20% for testing.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=random_state
    )

    # Fit the scaler on the training split only, so that the min/max of the test
    # rows do not leak into training. Test values may therefore fall slightly
    # outside [0, 1], which is expected.
    scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    classifier = svm.SVC(kernel='poly')
    classifier.fit(x_train, y_train)  # fit svm classifier to the train data

    print("Accuracy on train set: ", classifier.score(x_train, y_train))
    print("Accuracy on test set: ", classifier.score(x_test, y_test))
    # Persisting is intentionally left disabled. When it is re-enabled, the fitted
    # scaler has to be saved next to the classifier, otherwise new samples cannot
    # be scaled the same way:
    #     joblib.dump(classifier, joblib_file)

In [5]:
def create_knn_classifier(csv_file, joblib_file, random_state=None, n_neighbors=500):
    '''
        Train a k-nearest-neighbors classifier on the feature vectors stored in a
        csv file and print its accuracy on the train and test splits.

        csv_file: path of a csv holding an 'img_name' column, a 'class' label
            column and one column per feature.
        joblib_file: accepted for symmetry with the other create_* functions;
            this function does not persist the model, so it is unused.
        random_state: optional seed forwarded to train_test_split so the 80/20
            split can be reproduced. None (default) keeps the split random.
        n_neighbors: number of neighbors to vote with (default 500, the value
            that used to be hard-coded). It is capped at the training-set size.

        Returns None; the accuracies are only printed.
    '''
    training_data = pandas.read_csv(csv_file)  # import our training data from the csv file

    x, y = training_data.drop(['img_name', 'class'], axis=1), training_data['class']

    # Split first: 80% for training, 20% for testing.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=random_state
    )

    # Fit the scaler on the training split only, so that the min/max of the test
    # rows do not leak into training.
    scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    # scikit-learn rejects a k larger than the number of fitted samples, so cap it
    # to keep small datasets from crashing at prediction time.
    effective_neighbors = min(n_neighbors, len(x_train))
    classifier = neighbors.KNeighborsClassifier(effective_neighbors)
    classifier.fit(x_train, y_train)  # fit knn classifier to the train data

    print("K-nearest neighbors classifier: ")
    print("Accuracy on train set: ", classifier.score(x_train, y_train))
    print("Accuracy on test set: ", classifier.score(x_test, y_test))

In [6]:
def create_random_forest_classifier(csv_file, joblib_file, random_state=None):
    '''
        Train a random forest classifier on the feature vectors stored in a csv
        file and print its accuracy on the train and test splits.

        csv_file: path of a csv holding an 'img_name' column, a 'class' label
            column and one column per feature.
        joblib_file: accepted for symmetry with the other create_* functions;
            this function does not persist the model, so it is unused.
        random_state: optional seed used for both the 80/20 split and the
            forest itself. None (default) keeps both random.

        Returns None; the accuracies are only printed.
    '''
    training_data = pandas.read_csv(csv_file)  # import our training data from the csv file

    x, y = training_data.drop(['img_name', 'class'], axis=1), training_data['class']

    # Split first: 80% for training, 20% for testing.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=random_state
    )

    # Fit the scaler on the training split only, so that the min/max of the test
    # rows do not leak into training.
    scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    classifier = ensemble.RandomForestClassifier(
        max_depth=8, n_estimators=20, max_features=3, random_state=random_state
    )
    classifier.fit(x_train, y_train)  # fit random forest classifier to the train data

    print("Random Forests classifier: ")
    print("Accuracy on train set: ", classifier.score(x_train, y_train))
    print("Accuracy on test set: ", classifier.score(x_test, y_test))

In [7]:
def create_mlp_classifier(csv_file, joblib_file, random_state=None):
    '''
        Train a multi-layer perceptron classifier on the feature vectors stored in
        a csv file and print its accuracy on the train and test splits.

        csv_file: path of a csv holding an 'img_name' column, a 'class' label
            column and one column per feature.
        joblib_file: accepted for symmetry with the other create_* functions;
            this function does not persist the model, so it is unused.
        random_state: optional seed used for both the 80/20 split and the
            network initialisation. None (default) keeps both random.

        Returns None; the accuracies are only printed.
    '''
    training_data = pandas.read_csv(csv_file)  # import our training data from the csv file

    x, y = training_data.drop(['img_name', 'class'], axis=1), training_data['class']

    # Split first: 80% for training, 20% for testing.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=random_state
    )

    # Fit the scaler on the training split only, so that the min/max of the test
    # rows do not leak into training.
    scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    classifier = neural_network.MLPClassifier(
        alpha=1, max_iter=1000, random_state=random_state
    )
    classifier.fit(x_train, y_train)  # fit mlp classifier to the train data

    print("MLP NN classifier: ")
    print("Accuracy on train set: ", classifier.score(x_train, y_train))
    print("Accuracy on test set: ", classifier.score(x_test, y_test))

In [8]:
def extract_update_features(image):
    '''
        Compute the feature vector for image.name and merge it into
        image.features, then hand the same image object back.

        The first three entries returned by get_feature_vector are stored under
        the keys farid_r_<i>, farid_g_<i> and farid_b_<i> (presumably the red,
        green and blue channels), with <i> counting from 1. Every value is
        converted to float before it is stored.
    '''
    per_channel = get_feature_vector(image.name)
    # pair each key template with the values it labels, in r, g, b order
    labelled_channels = (
        ('farid_r_{}', per_channel[0]),
        ('farid_g_{}', per_channel[1]),
        ('farid_b_{}', per_channel[2]),
    )

    named_features = {}  # will have every feature in every channel
    for key_template, channel_values in labelled_channels:
        for position, raw_value in enumerate(channel_values, start=1):
            named_features[key_template.format(position)] = float(raw_value)

    # merge only once everything converted cleanly, so a failure leaves image untouched
    image.features.update(named_features)
    return image

In [9]:
def extract_from_images_list(img_list):
    '''
        Run extract_update_features on every image of img_list, in order.

        The images are updated in place; the very same list object is returned
        so that calls can be chained.
    '''
    for entry in img_list:
        extract_update_features(entry)
    return img_list

In [10]:
def get_img_lists(dir_location):
    '''
        Build one Image object per entry found directly inside dir_location.

        Each Image is created from the entry's path, its extension (as reported
        by get_img_extension) and its size in bytes. Entries come back in the
        order glob yields them; an empty or missing directory gives [].
    '''
    pattern = "{}/*".format(dir_location)  # every entry of the image directory
    return [
        Image(path, get_img_extension(path), os.path.getsize(path))
        for path in glob.glob(pattern)
    ]
# Image directory under the current working directory. os.path.join picks the
# separator of the host OS; the previous hard-coded '\\' only worked on Windows.
dire = os.path.join(os.getcwd(), 'images')
# get_img_lists(dire)

In [11]:
def get_img_extension(img_name):
    '''
        Return the extension of img_name, leading dot included ('' when the
        name has none). Only the file name is inspected; the file is not opened.
    '''
    return os.path.splitext(img_name)[1]


In [12]:
def extract_features(dir_location):
    '''
        Build the image objects found in the 'stego-reduced' and 'clean-reduced'
        sub-directories of dir_location and fill in their features.

        Returns a (stego_images, clean_images) tuple of image lists.
    '''
    # list both directories first, then extract features, stego before clean
    stego_dir = "{}/stego-reduced".format(dir_location)
    clean_dir = "{}/clean-reduced".format(dir_location)
    stego_batch = get_img_lists(stego_dir)
    clean_batch = get_img_lists(clean_dir)

    return (
        extract_from_images_list(stego_batch),
        extract_from_images_list(clean_batch),
    )

In [13]:
def write_img_to_csv(stego_with_features, clean_with_features):
    '''
        Write every image's features to 'img-features.csv' in the current
        working directory, one row per image.

        Columns are 'img_name', then each feature name in order of first
        appearance, then 'class' (1 for stego images, 0 for clean ones).
        Stego rows are written before clean rows; a feature an image does not
        have is left empty in its row.
    '''
    destination = 'img-features.csv'  # our csv output file

    rows = []
    seen_features = []  # feature names, in the order they were first met

    labelled_groups = ((stego_with_features, 1), (clean_with_features, 0))
    for group, label in labelled_groups:
        for img in group:
            row = {'img_name': img.name}
            for key, value in img.features.items():
                row[key] = value
                if key not in seen_features:
                    seen_features.append(key)
            row['class'] = label
            rows.append(row)

    with open(destination, 'w', newline='') as handle:
        header = ['img_name', *seen_features, 'class']
        writer = csv.DictWriter(handle, fieldnames=header)
        writer.writeheader()
        writer.writerows(rows)
    
    

In [14]:
def train_model(dir_location):
    '''
        Extract features, then create, train, and test our model.

        Reads the images under '<dir_location>/images', writes their features
        to 'img-features.csv' and trains the SVM classifier from that file.
    '''
    images_root = "{}/images".format(dir_location)

    stego_batch, clean_batch = extract_features(images_root)
    write_img_to_csv(stego_batch, clean_batch)

    create_svm_classifier('img-features.csv', 'img-svm.joblib')

In [15]:
# Driver cell: trains and scores one classifier on the previously written
# 'img-features.csv'. Uncomment train_model(...) to redo feature extraction
# first, or swap in one of the other create_* calls to compare models.
# Every alternative below is given the same 'img-svm.joblib' name.
# train_model(os.getcwd())
create_svm_classifier('img-features.csv', 'img-svm.joblib')
# create_knn_classifier('img-features.csv', 'img-svm.joblib')
# create_random_forest_classifier('img-features.csv', 'img-svm.joblib')
# create_mlp_classifier('img-features.csv', 'img-svm.joblib')



Accuracy on train set:  0.5005202913631633
Accuracy on test set:  0.49480249480249483


In [30]:
# Score a previously saved model on a fresh random 20% slice of 'img-features.csv'.
# `sklearn` is not referenced by name anywhere in this cell.
import sklearn
# joblib files are pickle-based: loading one can execute arbitrary code, so only
# load model files that come from a trusted source.
loaded_model = joblib.load("img-lr-jessica.joblib")

training_data = pandas.read_csv('img-features.csv') #import our training data from the csv file
x, y = training_data.drop(['img_name', 'class'], axis=1), training_data['class']

# NOTE(review): the scaler is re-fit here on the whole csv; whether that matches
# the scaling applied when the loaded model was trained cannot be told from this
# notebook - confirm before trusting the score.
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
x = scaler.fit_transform(x) #scale the features in the interval [0:1]
    
# NOTE(review): the split is random and unseeded, so these "test" rows may overlap
# rows the loaded model was trained on, and the score changes from run to run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2) #80% for training, 20% for testing

print("Accuracy on test set: ", loaded_model.score(x_test, y_test))

Accuracy on test set:  0.5031185031185031




In [107]:
import sys
!{sys.executable} -m pip install scikit-learn=0.19.2

ERROR: Invalid requirement: 'scikit-learn=0.19.2'
Hint: = is not a valid operator. Did you mean == ?


In [34]:
# Inspection cell: reload the feature csv and display the feature matrix.
training_data = pandas.read_csv('img-features.csv') #import our training data from the csv file
    
# x keeps every column except 'img_name' and 'class'; y is the 'class' column.
x, y = training_data.drop(['img_name', 'class'], axis=1), training_data['class']
# Bare last expression so the notebook renders the frame.
x

Unnamed: 0,farid_r_1,farid_r_2,farid_r_3,farid_r_4,farid_r_5,farid_r_6,farid_r_7,farid_r_8,farid_r_9,farid_r_10,...,farid_b_27,farid_b_28,farid_b_29,farid_b_30,farid_b_31,farid_b_32,farid_b_33,farid_b_34,farid_b_35,farid_b_36
0,-0.117610,931.664708,0.196375,2.580318,-13.333019,10931.936584,-0.081772,1.409096,0.006145,113.285946,...,0.300753,14.744447,-0.039054,3443.002789,-0.435233,0.059799,1.170623e-05,46.070723,-0.063086,7.877356
1,-0.241478,1914.863894,-0.122600,2.229304,-19.946512,19668.566825,0.078170,-0.738696,0.012616,232.841788,...,-0.175333,8.317309,-0.057065,6188.807620,-0.356140,-1.064673,2.118808e-05,52.761690,0.038872,3.939216
2,0.035054,201.088956,0.657866,23.197017,-5.244294,2675.208450,-0.269379,3.740757,-0.001831,24.451241,...,0.659413,48.990810,-0.050558,5143.925948,-0.373919,-0.849749,6.487192e-06,4.248978,-0.138656,28.868942
3,-0.161205,491.256853,0.020983,15.069482,-12.773164,12029.228004,-0.145996,1.751877,0.008422,59.736691,...,-0.128606,70.671139,-0.044134,4568.487387,-0.448853,0.218712,1.123088e-05,6.535435,0.029716,42.165031
4,-0.160191,281.236138,0.138394,29.563047,-7.429747,3447.779958,-0.162200,3.894538,0.008369,34.199515,...,-0.364053,331.371977,-0.018899,934.762252,-0.564545,2.759619,3.203432e-07,1.181646,0.077636,201.991284
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2398,-0.122719,571.887975,-0.290372,19.482851,-7.598255,6452.785644,-0.329063,4.350538,0.006412,69.539607,...,-0.543838,120.992250,-0.020872,1902.841016,-0.641468,3.478047,1.868390e-05,4.949605,0.120218,73.017074
2399,-0.015311,190.819437,0.272703,41.541720,-4.802315,1969.063531,-0.234632,3.117024,0.000800,23.202416,...,0.115036,74.247965,-0.021430,2302.213828,-0.685358,4.134864,-1.641496e-06,7.076301,-0.024812,44.357991
2400,-0.199249,305.342124,0.099720,14.978776,-20.411398,23256.686289,0.014610,-0.218370,0.010410,37.132313,...,-0.226035,39.343578,-0.048312,4622.728977,-0.375460,-0.797586,9.397364e-06,5.321953,0.050280,22.960613
2401,-0.148812,232.047080,0.156240,31.993590,-10.402956,6524.482399,-0.064958,1.184954,0.007775,28.218031,...,0.229299,58.944709,-0.046596,4454.519837,-0.382499,-0.735140,2.113706e-05,8.136546,-0.044840,34.969698
