#### 1.SVM Classifier for Wisconsin Breast Cancer Diagnosis

In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [3]:

file_path=r"breast-cancer-wisconsin.csv"

df=pd.read_csv(file_path)

In [4]:
#Display the first column
print(df.iloc[:,0].head())

0    1000025
1    1002945
2    1015425
3    1016277
4    1017023
Name: Id, dtype: int64


In [5]:
#Dropping the 'id' column as non-informative
df=df.drop(columns=['Id'])

In [6]:
#Strip leading  and trailing  spaces from column names
df.columns= df.columns.str.strip()

In [7]:
#Verify the datatype
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Clump Thickness              699 non-null    int64 
 1   Uniformity of Cell Size      699 non-null    int64 
 2   Uniformity of Cell Shape     699 non-null    int64 
 3   Marginal Adhesion            699 non-null    int64 
 4   Single Epithelial Cell Size  699 non-null    int64 
 5   Bare Nuclei                  699 non-null    object
 6   Bland Chromatin              699 non-null    int64 
 7   Normal Nucleoli              699 non-null    int64 
 8   Mitoses                      699 non-null    int64 
 9   Class                        699 non-null    int64 
dtypes: int64(9), object(1)
memory usage: 54.7+ KB


In [8]:
# Convert 'Bare Nuclei' to numeric, setting errors='coerce' to handle '?' as NaN
df['Bare Nuclei'] = pd.to_numeric(df['Bare Nuclei'], errors='coerce')

In [9]:
#Drp rows with any missiong values
df=df.dropna()

In [10]:
#separate features and target 
X =df.drop(columns=['Class'])
y=df['Class']


In [11]:
#Split the data into trainig and test sets
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
#Standardize the features
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

In [13]:
#Train  SVM Classifier
svm_classifier=SVC(kernel='linear', random_state=42)
svm_classifier.fit(X_train, y_train)


In [14]:
#predictions
y_pred=svm_classifier.predict(X_test)

In [15]:
#Evaluate the model
accuracy =accuracy_score(y_test, y_pred)
report=classification_report(y_test, y_pred)

In [16]:
#Results
print("Accuracy:", accuracy)
print ("Clasification Report:\n", report)

Accuracy: 0.9708029197080292
Clasification Report:
               precision    recall  f1-score   support

           2       0.96      0.99      0.97        79
           4       0.98      0.95      0.96        58

    accuracy                           0.97       137
   macro avg       0.97      0.97      0.97       137
weighted avg       0.97      0.97      0.97       137



#### 2. SVM Regression for the Tips Data Set


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [18]:

file_path = 'tips.csv'
tips_df=pd.read_csv(file_path)

In [19]:
#features&target
X= tips_df.drop(columns=['tip'])
y=tips_df["tip"]

In [20]:

print(tips_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB
None


In [21]:
# Selectează coloanele numerice
numeric_features = tips_df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Selectează coloanele categorice
categorical_features = tips_df.select_dtypes(include=['object']).columns.tolist()

print("Coloane numerice:", numeric_features)
print("Coloane categorice:", categorical_features)


Coloane numerice: ['total_bill', 'tip', 'size']
Coloane categorice: ['sex', 'smoker', 'day', 'time']


In [22]:
numeric_features = ["total_bill", "size"]
categorical_features = ["sex", "smoker", "day", "time"]

In [23]:
# Preprocess data: scale numeric features and one-hot encode categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(drop="first"), categorical_features)
    ]
)

In [24]:
# Create a pipeline with preprocessing and an SVM regressor
svm_regression_pipeline= Pipeline([
    ("preprocessor", preprocessor),
    ("svr", SVR())
])


In [25]:
# Define a parameter grid for hyperparameter tuning
param_grid = {
    'svr__C': [0.1, 1, 10, 100],
    'svr__epsilon': [0.1, 0.2, 0.5, 1],
    'svr__kernel': ['linear', 'rbf', 'poly']
}

In [26]:
# Use GridSearchCV to find the best parameters
grid_search = GridSearchCV(svm_regression_pipeline, param_grid, cv=5, scoring='r2')
grid_search.fit(X, y)

In [27]:
# Print the best parameters from GridSearchCV
print("Best parameters:", grid_search.best_params_)

Best parameters: {'svr__C': 0.1, 'svr__epsilon': 0.2, 'svr__kernel': 'linear'}


In [28]:
#Split the data
X_train, X_test, y_train, y_test =train_test_split(X, y, test_size=0.2, random_state=42 )



In [29]:
# Train the best SVM model on the training data
best_svm_model = grid_search.best_estimator_
best_svm_model.fit(X_train, y_train)

In [30]:
#Prediction
y_pred = best_svm_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)

In [31]:
# Print the mean squared error and R^2
print(f"MSE: {mse}")
print(f"R2: {r_squared}")

MSE: 0.6539107347086293
R2: 0.47685941987260916


#### 3. SVM Regression for the music-sub.csv dataset
<font color = 'orange'> Nu-mi merge mie, efectiv se blocheaza la fit the model </font>


In [32]:
import pandas as pd

# Load the dataset
data = pd.read_csv('music-sub.csv')

# Display the first few rows of the dataset
print(data.head())


      Unnamed: 0 Artist  Type        LVar       LAve   LMax     LFEner  \
0  Dancing Queen   Abba  Rock  17600755.6 -90.006867  29921  105.92095   
1     Knowing Me   Abba  Rock   9543020.9 -75.766719  27626  102.83616   
2  Take a Chance   Abba  Rock   9049481.5 -98.062924  26372  102.32488   
3      Mamma Mia   Abba  Rock   7557437.3 -90.471062  28898  101.61648   
4    Lay All You   Abba  Rock   6282285.6 -88.952631  27940  100.30076   

       LFreq  
0   59.57379  
1   58.48031  
2  124.59397  
3   48.76513  
4   74.02039  


In [33]:
data.head()

Unnamed: 0.1,Unnamed: 0,Artist,Type,LVar,LAve,LMax,LFEner,LFreq
0,Dancing Queen,Abba,Rock,17600755.6,-90.006867,29921,105.92095,59.57379
1,Knowing Me,Abba,Rock,9543020.9,-75.766719,27626,102.83616,58.48031
2,Take a Chance,Abba,Rock,9049481.5,-98.062924,26372,102.32488,124.59397
3,Mamma Mia,Abba,Rock,7557437.3,-90.471062,28898,101.61648,48.76513
4,Lay All You,Abba,Rock,6282285.6,-88.952631,27940,100.30076,74.02039


In [34]:
data['Title']=data['Unnamed: 0']
data.drop(columns='Unnamed: 0', inplace=True)

In [35]:
print(data.isnull().sum())


Artist    0
Type      0
LVar      0
LAve      0
LMax      0
LFEner    0
LFreq     0
Title     0
dtype: int64


In [36]:
data.columns

Index(['Artist', 'Type', 'LVar', 'LAve', 'LMax', 'LFEner', 'LFreq', 'Title'], dtype='object')

In [37]:
new_order = ['Title'] + [col for col in data.columns if col != 'Title']
df = data[new_order]
df.head()

Unnamed: 0,Title,Artist,Type,LVar,LAve,LMax,LFEner,LFreq
0,Dancing Queen,Abba,Rock,17600755.6,-90.006867,29921,105.92095,59.57379
1,Knowing Me,Abba,Rock,9543020.9,-75.766719,27626,102.83616,58.48031
2,Take a Chance,Abba,Rock,9049481.5,-98.062924,26372,102.32488,124.59397
3,Mamma Mia,Abba,Rock,7557437.3,-90.471062,28898,101.61648,48.76513
4,Lay All You,Abba,Rock,6282285.6,-88.952631,27940,100.30076,74.02039


In [38]:
df.columns

Index(['Title', 'Artist', 'Type', 'LVar', 'LAve', 'LMax', 'LFEner', 'LFreq'], dtype='object')

In [39]:
from sklearn.preprocessing import LabelEncoder

# Drop 'Title' since it's not a feature for prediction
data = data.drop(columns=['Title'])

# Encode categorical variables
label_encoders = {}
for column in ['Artist', 'Type']:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le  # Store the label encoder for potential inverse transformation later

# Display the modified DataFrame
print(data.head())


   Artist  Type        LVar       LAve   LMax     LFEner      LFreq
0       0     2  17600755.6 -90.006867  29921  105.92095   59.57379
1       0     2   9543020.9 -75.766719  27626  102.83616   58.48031
2       0     2   9049481.5 -98.062924  26372  102.32488  124.59397
3       0     2   7557437.3 -90.471062  28898  101.61648   48.76513
4       0     2   6282285.6 -88.952631  27940  100.30076   74.02039


In [40]:
X = data.drop(columns=['Type'])  # Features
y = data['Type']  # Target variable


In [41]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from datetime import datetime

# Start measuring time
start_time = datetime.now()
print(f"Started at: {start_time}")

# Create an SVM classifier
model = SVC(kernel='linear')  # You can also try other kernels like 'rbf', 'poly', etc.

# Fit the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# End measuring time
end_time = datetime.now()
print(f"Ended at: {end_time}")

# Calculate and print the duration
print(f"Duration of the training and predictions: {end_time - start_time}")
# aici se blocheaza for some fucking reason incearca si tu

Started at: 2024-10-25 15:44:30.608323


In [None]:
# Print the confusion matrix and classification report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
