In [18]:
import pandas as pd
from scipy.sparse import issparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer

In [19]:
# Read the animals dataset from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer="animals.csv")
df.head(n=5)

Unnamed: 0,animal,behaviour,weight,habitat
0,bear,climbs trees for honey,304.18,land
1,lion,drinks at waterhole,185.34,land
2,turtle,swims in ponds,8.69,water
3,cat,hisses at strangers,4.77,land
4,fish,swims against current,1.78,water


In [20]:
# Separate the features from the target label.
# "Unnamed: 0" is the row counter that was saved into the CSV; it carries no
# information about the animal, so it is removed together with the target
# instead of being scaled and used as a numeric feature.
# errors="ignore" keeps this cell working if the index column is absent.
X = df.drop(columns=["animal", "Unnamed: 0"], errors="ignore")
y = df["animal"]

In [21]:
# Sort the feature columns into numeric, free-text and categorical groups.
text_col = []
cat_col = []

# "number" selects every numeric dtype (e.g. int32, float32 as well as
# int64/float64), so numeric columns are not left out of num_cols just
# because of their storage width.
num_cols = X.select_dtypes(include="number").columns
obj_cols = X.select_dtypes(include=["object"]).columns

# Heuristic: an object column whose non-missing values average more than one
# whitespace-separated word is treated as free text; all others as categorical.
for i in obj_cols:
    avg_words = X[i].dropna().astype(str).str.split().str.len().mean()
    if avg_words > 1:
        text_col.append(i)
    else:
        cat_col.append(i)

text_col

['behaviour']

In [22]:
def _rows_to_documents(values):
    """Turn the imputer's 2-D output into one text document per row.

    The text imputer returns an array of shape (n_rows, n_text_columns), while
    CountVectorizer expects a 1-D sequence with exactly one string per row.
    Joining each row keeps the number of documents equal to the number of
    rows. With a single text column the document is just that column's value;
    with several text columns their values are joined with a space.
    """
    return [" ".join(map(str, row)) for row in values]


# Numeric columns: fill gaps with the column mean, then standardise.
num_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scale", StandardScaler()),
    ]
)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
cat_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Text columns: replace missing text with an empty string, collapse each row
# into one document, then build bag-of-words counts.
# A named function is used instead of a lambda so the fitted pipeline can be
# pickled; lambdas cannot.
text_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="")),
        ("flatten", FunctionTransformer(_rows_to_documents, validate=False)),
        ("vectorizer", CountVectorizer()),
    ]
)


# (name, pipeline, columns) triples consumed by ColumnTransformer.
transformer = [
    ("num", num_pipe, num_cols),
    ("text", text_pipe, text_col),
    ("cat", cat_pipe, cat_col),
]

process = ColumnTransformer(transformers=transformer)

In [23]:
# Fit the ColumnTransformer on the feature frame and replace X with the
# transformed matrix.
# NOTE(review): this rebinds X, so re-running this cell on its own feeds the
# already-transformed matrix back into `process`; re-run the cell that builds
# X from df first.
X = process.fit_transform(X)

In [24]:
# Check whether the transformed feature matrix is a SciPy sparse matrix.
issparse(X)

True

In [6]:
# Scratch check: break a ", "-separated string into its parts.
a = "ayon, lol"
list1 = str.split(a, ", ")
print(list1)

['ayon', 'lol']


In [None]:
# Scratch data: a list of two-element string tuples, built by pairing the
# first elements with the second elements position by position.
lol = list(
    zip(
        ("vai ka bday", "vai", "vai ka baddie"),
        ("text", "tai", "good"),
    )
)

In [36]:
# Display the current contents of `lol`.
# NOTE(review): the stored output below does not match the definition in the
# cell directly above, which suggests it was produced by out-of-order
# execution - re-run from a fresh kernel to confirm.
lol

[('sex', 'text'), ('vai', 'tai'), ('vai ka baddie', 'good')]

In [40]:
# Scratch data: three two-element string tuples.
lol = [("sex", "text"), ("vai", "tai"), ("vai ka baddie", "good")]

# Drop every tuple that has "tai" as one of its elements; keep the rest in order.
lol = list(filter(lambda pair: "tai" not in pair, lol))

# Prints the two tuples that remain after the filter.
print(lol)

[('sex', 'text'), ('vai ka baddie', 'good')]


In [21]:
# Display the current contents of `lol`.
# NOTE(review): the stored output below (a tuple followed by two bare strings)
# does not correspond to any definition of `lol` visible in this notebook, and
# the execution count is out of sequence - likely stale state; re-run from a
# fresh kernel or delete this cell.
lol

[('sex', 'text'), 'vai', 'vai ka baddie']

In [None]:
# NOTE(review): `transforme` is not defined anywhere in the visible notebook,
# so this cell would raise NameError on a fresh kernel unless it is defined in
# a part of the notebook not shown here. Possibly a half-typed `transformer`
# (the list of ColumnTransformer entries) - confirm, then complete or delete.
transforme