## NLP (Zero-Shot Text Classification)

<img src='https://frenzy86.s3.eu-west-2.amazonaws.com/python/data/sent.png' width=1024>

In [None]:
!pip install textblob -q



In [1]:
# !wget https://frenzy86.s3.eu-west-2.amazonaws.com/python/data/unsupervised_test.csv

path = 'https://frenzy86.s3.eu-west-2.amazonaws.com/python/data/unsupervised_test.csv'

In [2]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv(path)
df

Unnamed: 0,text
0,b'insane ( but inspired ) musical about alferd...
1,b'written by david j . schow and john shirley ...
2,"b'miramax "" disinvited "" on-line media from pr..."
3,"b'in "" the astronaut\'s wife , "" charlize ther..."
4,b'one of the 90s\' most unwelcome thriller tre...
...,...
408,"b'hav plenty , as we are told in the beginning..."
409,b'brian de palma\'s snake eyes stars nicolas c...
410,b'contact ( pg ) there\'s a moment late in rob...
411,"b""this is a film that i was inclined to like a..."


In [3]:
# STEP1 DEFINIRE ED APPLICARE LA REGEX

patterns = {
            r'\d+': '',      # rimuove digits (numeri)
            r'[^\w\s]': '',  # Remove punteggiatura e simboli ...,'@!£$%
            r'\b\w{1,2}\b':'',#remove all token less than2 characters
            r'(http|www)[^\s]+':'', # remove website
            r'\s+': ' '      # rimuove tutti i multipli spazi con uno spazio
            }

def clean_column(df, column, patterns):
    for pattern, replacement in patterns.items():
        df[column] = df[column].str.replace(pattern, replacement)
        df[column] = df[column].str.lower() # applica il lower
    return df

In [4]:
df = clean_column(df, 'text', patterns)
df

Unnamed: 0,text
0,binsane but inspired musical about alferd pack...
1,bwritten david schow and john shirley based th...
2,bmiramax disinvited online media from press sc...
3,bin the astronauts wife charlize theron plays ...
4,bone the most unwelcome thriller trends return...
...,...
408,bhav plenty are told the beginning and reminde...
409,bbrian palmas snake eyes stars nicolas cages e...
410,bcontact theres moment late robert zemeckiss c...
411,bthis film that was inclined like the outset t...


In [6]:
from textblob import TextBlob

def classify_text(text):
    # Create a TextBlob object
    blob = TextBlob(text)
    
    # Perform sentiment analysis
    sentiment = blob.sentiment.polarity
    return sentiment

In [11]:
def classification(sentiment):
    if sentiment > 0.1:
        return 'Positive'
    elif sentiment < -0.05:
        return 'Negative'
    else:
        return 'Neutral'

In [12]:
df['sentiment'] = df['text'].apply(classify_text)
df

Unnamed: 0,text,sentiment,class
0,binsane but inspired musical about alferd pack...,0.126289,Positive
1,bwritten david schow and john shirley based th...,0.256284,Positive
2,bmiramax disinvited online media from press sc...,0.097024,Neutral
3,bin the astronauts wife charlize theron plays ...,0.014480,Neutral
4,bone the most unwelcome thriller trends return...,-0.011162,Neutral
...,...,...,...
408,bhav plenty are told the beginning and reminde...,0.180847,Positive
409,bbrian palmas snake eyes stars nicolas cages e...,-0.056387,Neutral
410,bcontact theres moment late robert zemeckiss c...,0.195740,Positive
411,bthis film that was inclined like the outset t...,-0.004464,Neutral


In [13]:
df['class'] = df['sentiment'].apply(classification)
df

Unnamed: 0,text,sentiment,class
0,binsane but inspired musical about alferd pack...,0.126289,Positive
1,bwritten david schow and john shirley based th...,0.256284,Positive
2,bmiramax disinvited online media from press sc...,0.097024,Neutral
3,bin the astronauts wife charlize theron plays ...,0.014480,Neutral
4,bone the most unwelcome thriller trends return...,-0.011162,Neutral
...,...,...,...
408,bhav plenty are told the beginning and reminde...,0.180847,Positive
409,bbrian palmas snake eyes stars nicolas cages e...,-0.056387,Negative
410,bcontact theres moment late robert zemeckiss c...,0.195740,Positive
411,bthis film that was inclined like the outset t...,-0.004464,Neutral


In [14]:
df['class'].value_counts()

Neutral     223
Positive    159
Negative     31
Name: class, dtype: int64

## scikit-llm

In [None]:
#pip install scikit-llm

At the moment Scikit-LLM is only compatible with some of the OpenAI models. Hence, a user-provided OpenAI API key is required.



In [None]:
from skllm.config import SKLLMConfig
SKLLMConfig.set_openai_key("<YOUR_KEY>")
SKLLMConfig.set_openai_org("<YOUR_ORGANISATION>")

In [None]:
# Zero Shot
from skllm import ZeroShotGPTClassifier
from skllm.datasets import get_classification_dataset

# demo sentiment analysis dataset
# labels: positive, negative, neutral
X, y = get_classification_dataset() 

clf = ZeroShotGPTClassifier(openai_model = "gpt-3.5-turbo")
clf.fit(X, y)
labels = clf.predict(X)

In [None]:
#Training without labeled data

from skllm import ZeroShotGPTClassifier
from skllm.datasets import get_classification_dataset

X, _ = get_classification_dataset()

clf = ZeroShotGPTClassifier()
clf.fit(None, ['positive', 'negative', 'neutral'])
labels = clf.predict(X)