pip install pandas
pip install keras
pip install opencv-python
pip install matplotlib

In [5]:
import pandas as pd

# Paths to data files
train_data_path = 'C:\\Users\\alexw\\OneDrive\\Documents\\03_Education\\University\\University_Programming\\Python\\Big_Data\\Coursework\\Datasets\\WIDERFACE\\wider_face_split\\wider_face_train_bbx_gt.txt'
test_data_path = 'C:\\Users\\alexw\\OneDrive\\Documents\\03_Education\\University\\University_Programming\\Python\\Big_Data\\Coursework\\Datasets\\WIDERFACE\\wider_face_split\\wider_face_test_filelist.txt'
val_data_path = 'C:\\Users\\alexw\\OneDrive\\Documents\\03_Education\\University\\University_Programming\\Python\\Big_Data\\Coursework\\Datasets\\WIDERFACE\\wider_face_split\\wider_face_val_bbx_gt.txt'

# Function to process training/validation data
def process_train_val_data(data_path):
    # Read the lines from the text file
    with open(data_path, 'r') as file:
        lines = file.readlines()

    # Initialize lists to store data
    image_paths = []
    num_faces_list = []
    face_info_list = []

    # Process lines to extract information
    for line in lines:
        line = line.strip()

        # Skip empty lines
        if not line:
            continue

        # Check if it represents an image path
        if line.endswith('.jpg'):
            image_paths.append(line)
            # Initialize num_faces and face_info
            num_faces = None
            face_info = []
        else:
            # If it's not an image path, it's face information
            if num_faces is None:
                # Extract the number of faces
                try:
                    num_faces = int(line)
                except ValueError:
                    print(f"Error parsing number of faces in line: {line}")
                    continue
            else:
                # Split face information
                face_info.extend([int(x) for x in line.split()])

            # Check if we have collected enough face information
            if len(face_info) == num_faces * 10:
                num_faces_list.append(num_faces)
                face_info_list.append(face_info)
                if len(image_paths) != len(num_faces_list):
                    print(f"Skipping incomplete data point: {image_paths[-1]}")
                    image_paths.pop()  # Remove the corresponding image path

    # Create a DataFrame
    df = pd.DataFrame({
        'Image_Path': image_paths,
        'Num_Faces': num_faces_list,
        'Face_Info': face_info_list
    })

    return df

# Function to process testing data
def process_test_data(data_path):
    # Read the lines from the text file
    with open(data_path, 'r') as file:
        test_lines = file.readlines()

    # Create a DataFrame with just the image paths
    df = pd.DataFrame({
        'Image_Path': [line.strip() for line in test_lines],
    })

    return df

# Process training data
train_df = process_train_val_data(train_data_path)

# Process testing data
test_df = process_test_data(test_data_path)

# Process validation data
val_df = process_train_val_data(val_data_path)

train_df.to_csv('csv\\train_data.csv', index=False)
test_df.to_csv('csv\\test_data.csv', index=False)
val_df.to_csv('csv\\val_data.csv', index=False)

# Display the first few rows of each DataFrame
print("Training Data:")
print(train_df.head())
print("\nTesting Data:")
print(test_df.head())
print("\nValidation Data:")
print(val_df.head(10))


Training Data:
                                  Image_Path  Num_Faces  \
0  0--Parade/0_Parade_marchingband_1_849.jpg          1   
1        0--Parade/0_Parade_Parade_0_904.jpg          1   
2  0--Parade/0_Parade_marchingband_1_799.jpg         21   
3  0--Parade/0_Parade_marchingband_1_117.jpg          9   
4  0--Parade/0_Parade_marchingband_1_778.jpg         35   

                                           Face_Info  
0             [449, 330, 122, 149, 0, 0, 0, 0, 0, 0]  
1              [361, 98, 263, 339, 0, 0, 0, 0, 0, 0]  
2  [78, 221, 7, 8, 2, 0, 0, 0, 0, 0, 78, 238, 14,...  
3  [69, 359, 50, 36, 1, 0, 0, 0, 0, 1, 227, 382, ...  
4  [27, 226, 33, 36, 1, 0, 0, 0, 2, 0, 63, 95, 16...  

Testing Data:
                                  Image_Path
0  0--Parade/0_Parade_marchingband_1_737.jpg
1  0--Parade/0_Parade_marchingband_1_494.jpg
2        0--Parade/0_Parade_Parade_0_338.jpg
3  0--Parade/0_Parade_marchingband_1_533.jpg
4   0--Parade/0_Parade_marchingband_1_62.jpg

Validation Dat

In [6]:
from sklearn.preprocessing import StandardScaler
import numpy as np

def preprocess_data(df):
    # Handle Missing Values
    # Check for missing values in the DataFrame
    missing_values = df.isnull().sum()
    print("Missing Values:")
    print(missing_values)

    # Normalize Data
    # Initialize the StandardScaler
    scaler = StandardScaler()

    # Normalize the Num_Faces column
    df['Num_Faces_Normalized'] = scaler.fit_transform(df[['Num_Faces']])

    # Extract Features (Replace this with your feature extraction method)
    # For demonstration purposes, let's assume we're using a simple count of faces as a feature
    df['Num_Faces_Feature'] = df['Num_Faces']

    # Convert Data into Suitable Format
    # Assuming we're only using the normalized Num_Faces and the extracted feature as input
    X = df[['Num_Faces_Normalized', 'Num_Faces_Feature']]
    y = df['Num_Faces']  # Using 'Num_Faces' as the target variable

    return X, y

# Preprocess Training Data
X_train, y_train = preprocess_data(train_df)
print("Preprocessed Training Data:")
print(X_train.head())
print("Target Variable:")
print(y_train.head())

# Read the lines from the test data file
with open(test_data_path, 'r') as file:
    test_lines = file.readlines()

# Preprocess Testing Data
test_df = pd.DataFrame({
    'Image_Path': test_lines,
    'Num_Faces': np.NaN  # Initialize with NaN values
})

# X_test, y_test = preprocess_data(test_df)
print("Preprocessed Testing Data:")
print(test_df.head())

# Preprocess Validation Data
X_val, y_val = preprocess_data(val_df)
print("Preprocessed Validation Data:")
print(X_val.head())
print("Target Variable:")
print(y_val.head())


Missing Values:
Image_Path    0
Num_Faces     0
Face_Info     0
dtype: int64
Preprocessed Training Data:
   Num_Faces_Normalized  Num_Faces_Feature
0             -0.251883                  1
1             -0.251883                  1
2              0.190897                 21
3             -0.074771                  9
4              0.500844                 35
Target Variable:
0     1
1     1
2    21
3     9
4    35
Name: Num_Faces, dtype: int64
Preprocessed Testing Data:
                                    Image_Path  Num_Faces
0  0--Parade/0_Parade_marchingband_1_737.jpg\n        NaN
1  0--Parade/0_Parade_marchingband_1_494.jpg\n        NaN
2        0--Parade/0_Parade_Parade_0_338.jpg\n        NaN
3  0--Parade/0_Parade_marchingband_1_533.jpg\n        NaN
4   0--Parade/0_Parade_marchingband_1_62.jpg\n        NaN
Missing Values:
Image_Path    0
Num_Faces     0
Face_Info     0
dtype: int64
Preprocessed Validation Data:
   Num_Faces_Normalized  Num_Faces_Feature
0              2.855410  

In [7]:
# Preprocessing Stage. Validates that there are no missing values. Then standardises the data by scaling it. Then applies dimensionality reduction using PCA and TSNE

from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Apply Dimensionality Reduction
def apply_dimensionality_reduction(X):
    # Apply PCA
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)

    # Apply t-SNE
    tsne = TSNE(n_components=2)
    X_tsne = tsne.fit_transform(X)

    return X_pca, X_tsne

# Evaluate Dimensionality Reduction Techniques
def evaluate_dimensionality_reduction(X, X_pca, X_tsne):
    # You can evaluate the techniques using metrics like explained variance, silhouette score, etc.
    # For simplicity, we'll just print out the shape of the transformed data
    print("PCA Transformed Data Shape:", X_pca.shape)
    print("t-SNE Transformed Data Shape:", X_tsne.shape)

# Modify the preprocessing function to incorporate dimensionality reduction
def preprocess_data(df):
    # Handle Missing Values
    # Check for missing values in the DataFrame
    missing_values = df.isnull().sum()
    print("Missing Values:")
    print(missing_values)

    # Normalize Data
    # Initialize the StandardScaler
    scaler = StandardScaler()

    # Normalize the Num_Faces column
    df['Num_Faces_Normalized'] = scaler.fit_transform(df[['Num_Faces']])

    # Extract Features (Replace this with your feature extraction method)
    # For demonstration purposes, let's assume we're using a simple count of faces as a feature
    df['Num_Faces_Feature'] = df['Num_Faces']

    # Convert Data into Suitable Format
    # Assuming we're only using the normalized Num_Faces and the extracted feature as input
    X = df[['Num_Faces_Normalized', 'Num_Faces_Feature']]
    y = df['Num_Faces']  # Using 'Num_Faces' as the target variable

    # Apply Dimensionality Reduction
    X_pca, X_tsne = apply_dimensionality_reduction(X)

    # Evaluate Dimensionality Reduction Techniques
    evaluate_dimensionality_reduction(X, X_pca, X_tsne)

    return X, y

# Preprocess Training Data
X_train, y_train = preprocess_data(train_df)
print("Preprocessed Training Data:")
print(X_train.head())
print("Target Variable:")
print(y_train.head())


Missing Values:
Image_Path              0
Num_Faces               0
Face_Info               0
Num_Faces_Normalized    0
Num_Faces_Feature       0
dtype: int64
PCA Transformed Data Shape: (12880, 2)
t-SNE Transformed Data Shape: (12880, 2)
Preprocessed Training Data:
   Num_Faces_Normalized  Num_Faces_Feature
0             -0.251883                  1
1             -0.251883                  1
2              0.190897                 21
3             -0.074771                  9
4              0.500844                 35
Target Variable:
0     1
1     1
2    21
3     9
4    35
Name: Num_Faces, dtype: int64


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train a logistic regression model
logreg_model = LogisticRegression()
logreg_model.fit(X_train, y_train)

# Test the logistic regression model using the test set
X_test_processed, y_test_processed = preprocess_data(test_df)
y_pred_test = logreg_model.predict(X_test_processed)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test_processed, y_pred_test)
print("Test Accuracy:", test_accuracy)

# Evaluate the model on the validation set
X_val_processed, y_val_processed = preprocess_data(val_df)
y_pred_val = logreg_model.predict(X_val_processed)

val_accuracy = accuracy_score(y_val_processed, y_pred_val)
print("Validation Accuracy:", val_accuracy)


Missing Values:
Image_Path        0
Num_Faces     16097
dtype: int64


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


ValueError: Input X contains NaN.
PCA does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values