In [1]:
import pandas as pd 
import numpy as np
from langchain_groq.chat_models import ChatGroq

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [1]:
from dotenv import load_dotenv
import os

# Load the environment variables from the .env file
load_dotenv()

# Access the API key
Groq_token = os.getenv('GROQ_API_KEY')

groq_models = {"llama3-70b": "llama3-70b-8192", "mixtral": "mixtral-8x7b-32768", "gemma-7b": "gemma-7b-it","llama3.1-70b":"llama-3.1-70b-versatile","llama3-8b":"llama3-8b-8192","llama3.1-8b":"llama-3.1-8b-instant","gemma-9b":"gemma2-9b-it"}

**NOTE : DO NOT SHARE THE API KEY WITH ANYONE. DO NOT COMMIT THE API KEY TO GITHUB.**

Always do a sanity check before committing the code to github. If the key is found in the code, you will be penalized with a 0.5 marks deduction.

# Zero Shot 

In [3]:
"""Demonstrate how to use Zero-Shot Learning and Few-Shot Learning to classify human activities 
based on the featurized accelerometer data"""

# Load the dataset
data = pd.read_csv("data.csv").iloc[:90000:100,:]

X= data.drop(['subject','activity'],axis=1)
y= data['activity']
X_train, X_test, y_train, y_test= train_test_split(X,y, test_size=0.3, random_state=42)

"""Zero Shot Learning"""

# System Prompts 
query = f"""
* You are a machine learning classsifier model(Real Input Discrete output). 
* Based on the accerlerometer values(accx, accy, accz) you have to predict the human activity.
* Activities can be among the following: Walking, Walking_Upstairs, Walking_Downstairs, Sitting, Standing, Laying.
* activity_labels = ["WALKING":1,"WALKING_UPSTAIRS":2,"WALKING_DOWNSTAIRS":3,"SITTING":4,"STANDING":5,"LAYING":6]
* The dataset is in the following format: {X_train}
* You have predict the human activity for every row in the X_train dataset.
* Just give the prediction array for the given dataset without any explanation or anything above or below it.
* Ensure that the predicted array is of the length = {len(X_train)} only.
""" 

# To use Groq LLMs 
model_name = "llama3.1-70b" # We can choose any model from the groq_models dictionary
llm = ChatGroq(model=groq_models[model_name], api_key=Groq_Token, temperature=0)
zero_shot_answer = llm.invoke(query).content

In [4]:
# Acccuracy of Zero Shot Learning

str= zero_shot_answer.strip("[]").split(",")
str_not_null= [i for i in str if i !=" "]
y_pred= np.array([int(i) for i in str_not_null])

zero_shot_accuracy_score = accuracy_score(y_train, y_pred)

"""The accuracy of the Zero Shot Learning model could not be calculated since the model did not 
provide the prediction array in the correct format."""

ValueError: Found input variables with inconsistent numbers of samples: [630, 2667]

# Few Shot

In [5]:
# Load the dataset
data = pd.read_csv("data.csv").iloc[:90000:100,:]

X= data.drop(['subject','activity'],axis=1)
y= data['activity']
X_train, X_test, y_train, y_test= train_test_split(X,y, test_size=0.3, random_state=42)

"""Few Shot Learning"""

# System Prompts 
query = f"""
* You are a machine learning classsifier model(Real Input Discrete output). 
* Based on the accerlerometer values(accx, accy, accz) you have to predict the human activity.
* Activities can be among the following: Walking, Walking_Upstairs, Walking_Downstairs, Sitting, Standing, Laying.
* activity_labels = ["WALKING":1,"WALKING_UPSTAIRS":2,"WALKING_DOWNSTAIRS":3,"SITTING":4,"STANDING":5,"LAYING":6]
* You have predict the human activity for every row input in the X_test dataset and output the corresponding activity_label.

* You have been trained on the following dataset:
* Training Dataset: {X_train}
* Training Labels: {y_train}

* The test dataset is in the following format: {X_test}

*PS: Only give out the prediction array for the given dataset without any explanation without anything above or below it.
*PS: Ensure that the prediction array is of the same length as the test dataset.
""" 

# To use Groq LLMs 
model_name = "llama3-70b" # We can choose any model from the groq_models dictionary
llm = ChatGroq(model=groq_models[model_name], api_key=Groq_Token, temperature=0)
few_shot_answer = llm.invoke(query).content

In [6]:
""" Acccuracy of Few Shot Learning"""

str= few_shot_answer.strip("[]").split(",")
str_not_null= [i for i in str if i !=" "]
y_pred= np.array([int(i) for i in str_not_null])

few_shot_accuracy_score = accuracy_score(y_test, y_pred)

"""The accuracy of the Few Shot Learning model could not be calculated since the model did not 
provide the prediction array in the correct format."""

ValueError: Found input variables with inconsistent numbers of samples: [270, 2481]

Q. Qualitatively demonstrate the performance of Few-Shot Learning with Zero-Shot Learning. Which method performs better? Why?
In general, Few-Shot Learning should perform better than Zero-Shot Learning because it has seen some examples of the target classes, allowing it to adapt better to the specific task. However, the performance difference can vary depending on the complexity of the task and the quality of the few-shot examples provided. We should take care of the problem of class imbalances as Few Shot Learning is prone to Overfitting.

In [36]:
"""Compare Few Shot accuracy with Sklearn Decision Tree accuracy"""

data= pd.read_csv("data.csv")

"""Train a decision tree model using the raw accelerometer data"""
X= data.drop(['subject','activity'],axis=1)
y= data['activity']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

decision_tree_model= DecisionTreeClassifier(random_state=42)

decision_tree_model.fit(X_train, y_train)

y_pred= decision_tree_model.predict(X_test)     # a row vector numpy array
y_pred= pd.Series(y_pred)                       # convert to pandas series

acc= accuracy_score(y_pred, y_test)
print(f"Accuracy of the Decision Tree model: {acc:.2f}")

Accuracy of the Decision Tree model: 0.71


In [7]:
"""What does the model classify when given input from an entirely new activity that it hasn't seen before?"""

# Let us assume that the new activity is "Jogging"
new_activity_data= np.array([0.1, 0.2, 0.3])

"""Few Shot Learning"""
# System Prompts 
query = f"""
* You are a machine learning classsifier model(Real Input Discrete output). 
* Based on the accerlerometer values(accx, accy, accz) you have to predict the human activity.
* Activities can be among the following: Walking, Walking_Upstairs, Walking_Downstairs, Sitting, Standing, Laying.
* activity_labels = ["WALKING":1,"WALKING_UPSTAIRS":2,"WALKING_DOWNSTAIRS":3,"SITTING":4,"STANDING":5,"LAYING":6]

* You have been trained on the following dataset:
* Training Dataset: {X_train}
* Training Labels: {y_train}

* The test dataset is in the following format: {new_activity_data}

* You have to predict the human activity for the new activity data.
* Kindly output only the activity name and nothing below or above it.
""" 

# To use Groq LLMs 
model_name = "llama3-70b" # We can choose any model from the groq_models dictionary
llm = ChatGroq(model=groq_models[model_name], api_key=Groq_Token, temperature=0)
few_shot_answer = llm.invoke(query).content

print(f"The model classified the new activity as: {few_shot_answer}")

The model classified the new activity as: WALKING


In [107]:
""""Test the Model with Random Data"""

new_data= pd.DataFrame(np.random.rand(1000,3), columns= ['accx', 'accy', 'accz'])
activity_data= np.random.randint(1,7,1000)
new_data['activity']= activity_data

"""Train a decision tree model using the raw accelerometer data"""
X= new_data.drop(['activity'],axis=1)
y= new_data['activity']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

"""Few Shot Learning"""
# System Prompts 
query = f"""
* You are a machine learning classsifier model(Real Input Discrete output). 
* Based on the accerlerometer values(accx, accy, accz) you have to predict the human activity.
* Activities can be among the following: Walking, Walking_Upstairs, Walking_Downstairs, Sitting, Standing, Laying.
* activity_labels = ["WALKING":1,"WALKING_UPSTAIRS":2,"WALKING_DOWNSTAIRS":3,"SITTING":4,"STANDING":5,"LAYING":6]

* You have been trained on the following dataset:
* Training Dataset: {X_train}
* Training Labels: {y_train}

* The test dataset is in the following format: {X_test}

*PS: Only give out the prediction array for the given dataset without any explanation without anything above or below it.
*PS: Ensure that the prediction array is of the same length as the test dataset.
""" 

# To use Groq LLMs 
model_name = "llama3-70b" # We can choose any model from the groq_models dictionary
llm = ChatGroq(model=groq_models[model_name], api_key=Groq_Token, temperature=0)
few_shot_answer = llm.invoke(query).content

print(few_shot_answer)

[3, 6, 1, 4, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 

In [108]:
""" Acccuracy of Few Shot Learning for Random Data"""

str= few_shot_answer.strip("[]").split(",")
str_not_null= [i for i in str if i !=" "]
y_pred= np.array([int(i) for i in str_not_null])

few_shot_accuracy_score = accuracy_score(y_test, y_pred)

"""The accuracy of the Few Shot Learning model could not be calculated since the model did not 
provide the prediction array in the correct format."""

ValueError: Found input variables with inconsistent numbers of samples: [300, 2493]